﻿//	Copyright (c) 2012, Michael Kunz. All rights reserved.
//	http://managedcuda.codeplex.com
//
//	This file is part of ManagedCuda.
//
//	ManagedCuda is free software: you can redistribute it and/or modify
//	it under the terms of the GNU Lesser General Public License as 
//	published by the Free Software Foundation, either version 2.1 of the 
//	License, or (at your option) any later version.
//
//	ManagedCuda is distributed in the hope that it will be useful,
//	but WITHOUT ANY WARRANTY; without even the implied warranty of
//	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
//	GNU Lesser General Public License for more details.
//
//	You should have received a copy of the GNU Lesser General Public
//	License along with this library; if not, write to the Free Software
//	Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston,
//	MA 02110-1301  USA, http://www.gnu.org/licenses/.

using System;
using System.Collections;
using System.Collections.Generic;
using System.Text;
using ManagedCuda.BasicTypes;
using ManagedCuda.VectorTypes;
using System.Runtime.InteropServices;
using System.Diagnostics;

namespace ManagedCuda
{
	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// The memory is laid out as <see cref="Height"/> rows of <see cref="Pitch"/> bytes each, of which the first
	/// <see cref="Width"/> elements per row are addressed by the indexer and by the 2D copy methods.<para/>
	/// Type: byte
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_byte: IDisposable, IEnumerable<byte>
	{
		// Host pointer as returned by cuMemHostAlloc.
		IntPtr _intPtr;
		// Same address as _intPtr, typed for element access in the indexer.
		byte* _ptr;
        // Total allocation size: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(byte)).
        SizeT _typeSize = 0;
        // Result of the most recent driver API call issued by this instance.
        CUResult res;
        // True once the memory was freed, or when construction failed before anything was allocated.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_byte and allocates pitchInBytes * height bytes on host using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(byte)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return Success</exception>
        public CudaPageLockedHostMemory2D_byte(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(byte));
            _sizeInBytes = _pitchInBytes * _height;

            if (_typeSize * width > _pitchInBytes)
            {
                // Nothing has been allocated: mark the instance as disposed so that the
                // finalizer does not report a misleading not-disposed warning.
                disposed = true;
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(byte)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (byte*) _intPtr;
            if (res != CUResult.Success)
            {
                // Allocation failed, so there is nothing to free later.
                disposed = true;
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_byte and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_byte(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_byte and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(byte).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_byte(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(byte)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_byte and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(byte).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_byte(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(byte)), height, allocFlags)
        {

        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_byte()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory. Calling Dispose more than once is harmless.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable. The memory is only freed when called with <c>fDisposing == true</c>;
        /// on the finalizer path a debug warning is written instead and cuMemFreeHost is not called.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                // The result is logged but deliberately not turned into an exception.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            // NOTE(review): the finalizer path never frees the allocation; this looks intentional
            // (driver calls from the finalizer thread) but should be confirmed.
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Private helpers
        /// <summary>
        /// Throws if the memory has already been freed.
        /// </summary>
        private void ThrowIfDisposed()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
        }

        /// <summary>
        /// Writes the outcome of the last driver call (stored in <c>res</c>) to the debug log and throws if it failed.
        /// </summary>
        /// <param name="apiName">Name of the driver API function to show in the log line</param>
        private void LogAndCheckResult(string apiName)
        {
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, apiName, res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Validates indexer arguments: the memory must still be allocated and (x, y) must address an element inside Width x Height.
        /// </summary>
        private void CheckIndex(SizeT x, SizeT y)
        {
            ThrowIfDisposed();
            long column = (long)x;
            long row = (long)y;
            if (column < 0 || column >= (long)_width)
                throw new ArgumentOutOfRangeException("x");
            if (row < 0 || row >= (long)_height)
                throw new ArgumentOutOfRangeException("y");
        }

        /// <summary>
        /// Creates a 2D copy descriptor whose source is this host memory; the caller fills in the destination.
        /// </summary>
        private CUDAMemCpy2D CreateCopyFromThisHost()
        {
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;
            return cpyProps;
        }

        /// <summary>
        /// Creates a 2D copy descriptor whose destination is this host memory; the caller fills in the source.
        /// </summary>
        private CUDAMemCpy2D CreateCopyToThisHost()
        {
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;
            return cpyProps;
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes (pitch * height)
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element. The element is located at row start (pitch * y bytes from the base pointer) plus x elements.
        /// </summary>
        /// <param name="x">X-index in elements, must be smaller than <see cref="Width"/></param>
        /// <param name="y">Y-index in elements, must be smaller than <see cref="Height"/></param>
        /// <returns>The element at (x, y)</returns>
        /// <exception cref="ObjectDisposedException">The memory has already been freed</exception>
        /// <exception cref="ArgumentOutOfRangeException">x or y lies outside of the Width x Height area</exception>
        public byte this[SizeT x, SizeT y]
        {
            get
            {
                CheckIndex(x, y);
                byte* line = (byte*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                CheckIndex(x, y);
                byte* line = (byte*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchron copy host to 2D Array. Copies Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyFromThisHost();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            LogAndCheckResult("cuMemcpy2D_v2");
        }

        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchron copy 2D Array to host. Copies Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyToThisHost();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            LogAndCheckResult("cuMemcpy2D_v2");
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchron copy host to device. Copies the whole allocation (SizeInBytes bytes, padding included) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            LogAndCheckResult("cuMemcpyHtoD");
        }

        /// <summary>
        /// Synchron copy host to device. Copies the whole allocation (SizeInBytes bytes, padding included) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        public void SynchronCopyToDevice(CudaDeviceVariable<byte> devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            LogAndCheckResult("cuMemcpyHtoD");
        }

        /// <summary>
        /// Synchron copy device to host. Copies SizeInBytes bytes from the device pointer into this host memory as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            LogAndCheckResult("cuMemcpyDtoH");
        }

        /// <summary>
        /// Synchron copy device to host. Copies SizeInBytes bytes from the device variable into this host memory as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        public void SynchronCopyToHost(CudaDeviceVariable<byte> devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            LogAndCheckResult("cuMemcpyDtoH");
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination in bytes</param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyFromThisHost();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            LogAndCheckResult("cuMemcpy2D_v2");
        }

        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<byte> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchron copy pitched device to host
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source in bytes</param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyToThisHost();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            LogAndCheckResult("cuMemcpy2D_v2");
        }

        /// <summary>
        /// Synchron copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<byte> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyFromThisHost();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            LogAndCheckResult("cuMemcpy2DAsync");
        }

        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyToThisHost();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            LogAndCheckResult("cuMemcpy2DAsync");
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchron Copy host to device. Copies the whole allocation (SizeInBytes bytes) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            LogAndCheckResult("cuMemcpyHtoDAsync");
        }

        /// <summary>
        /// Asynchron copy device to host. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            LogAndCheckResult("cuMemcpyDtoHAsync");
        }

        /// <summary>
        /// Asynchron Copy host to device. Copies the whole allocation (SizeInBytes bytes) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaDeviceVariable<byte> devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            LogAndCheckResult("cuMemcpyHtoDAsync");
        }

        /// <summary>
        /// Asynchron copy device to host. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<byte> devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            LogAndCheckResult("cuMemcpyDtoHAsync");
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination in bytes</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyFromThisHost();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            LogAndCheckResult("cuMemcpy2DAsync");
        }

        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<byte> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchron copy pitched device to host
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source in bytes</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyToThisHost();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            LogAndCheckResult("cuMemcpy2DAsync");
        }

        /// <summary>
        /// Asynchron copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<byte> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            ThrowIfDisposed();
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            LogAndCheckResult("cuMemHostGetDevicePointer");
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns>Flags reported by cuMemHostGetFlags</returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            ThrowIfDisposed();
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log the function that was actually called (was mislabelled as cuMemHostGetDevicePointer).
            LogAndCheckResult("cuMemHostGetFlags");
            return flags;
        }
        #endregion

        #region IEnumerable
        IEnumerator<byte> IEnumerable<byte>.GetEnumerator()
        {
            ThrowIfDisposed();
            IEnumerator<byte> enumerator = new CudaPageLockedHostMemory2DEnumerator_byte(this);
            return enumerator;
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            ThrowIfDisposed();
            IEnumerator enumerator = new CudaPageLockedHostMemory2DEnumerator_byte(this);
            return enumerator;
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_byte.<para/>
    /// Elements are visited row by row: x runs over [0, Width) for every y in [0, Height).
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_byte : IEnumerator<byte>
    {
        private CudaPageLockedHostMemory2D_byte _memory = null;
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // Explicit state flags replace the former "-1" start sentinel, so no negative value is ever stored in a SizeT.
        private bool _started = false;
        private bool _finished = false;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Memory to enumerate</param>
        /// <exception cref="ArgumentNullException">memory is null</exception>
        public CudaPageLockedHostMemory2DEnumerator_byte(CudaPageLockedHostMemory2D_byte memory)
        {
            if (memory == null) throw new ArgumentNullException("memory");
            _memory = memory;
        }

        // The enumerator owns no resources; the enumerated memory is left untouched.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Positions the enumerator before the first element again.
        /// </summary>
        public void Reset()
        {
            _currentX = 0;
            _currentY = 0;
            _started = false;
            _finished = false;
        }

        /// <summary>
        /// Element at the current position.
        /// </summary>
        /// <exception cref="InvalidOperationException">The enumerator is positioned before the first or after the last element</exception>
        public byte Current
        {
            get
            {
                // Reading through the indexer at an invalid position would access memory outside of the buffer.
                if (!_started || _finished)
                    throw new InvalidOperationException("Enumerator is positioned before the first or after the last element.");
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Element at the current position (boxed).
        /// </summary>
        object IEnumerator.Current
        {
            get { return Current; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points to a valid element; false once all elements were visited
        /// (immediately if Width or Height is zero)</returns>
        public bool MoveNext()
        {
            // Once the end was reached, stay there instead of advancing the position further.
            if (_finished)
                return false;

            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            if (!_started)
            {
                // First call: the position (0, 0) set by the constructor / Reset becomes current.
                _started = true;
            }
            else
            {
                _currentX += 1;
                if ((long)_currentX >= width)
                {
                    // End of row: wrap to the start of the next one.
                    _currentX = 0;
                    _currentY += 1;
                }
            }

            // An empty row (width == 0) holds no elements regardless of the height.
            _finished = width <= 0 || (long)_currentY >= height;
            return !_finished;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: uchar1
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_uchar1: IDisposable, IEnumerable<uchar1>
	{
		IntPtr _intPtr;
		uchar1* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_uchar1(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(uchar1));
            _sizeInBytes = _pitchInBytes * _height;

            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(uchar1)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
			_ptr = (uchar1*) _intPtr;
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uchar1(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(uchar1). Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uchar1(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uchar1)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar1 and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(uchar1).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_uchar1(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uchar1)), height, allocFlags)
        {

        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_uchar1()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory (via <c>Dispose(true)</c>) and removes this instance
        /// from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core of the dispose pattern.<para/>
        /// Called with <c>true</c> it frees the host memory through cuMemFreeHost (the driver result is only
        /// logged, never thrown) and marks the instance as disposed.<para/>
        /// Called with <c>false</c> (finalizer) nothing is freed; a "not-disposed" warning is written
        /// to the debug output instead.
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Already released: neither a second free nor a warning is wanted.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Raw pointer to the pinned host memory block held by this instance.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Distance in bytes between the start of one row and the start of the next.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the memory block in bytes. This is the byte count used by the linear
        /// (non-pitched) copy methods of this class.
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of a single element in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element directly in the pinned host memory.<para/>
        /// The row start is located at <c>y * Pitch</c> bytes from the base pointer; <c>x</c> then indexes
        /// elements within that row. Neither the indices nor the disposed state are checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at column x of row y</returns>
        public uchar1 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are addressed in bytes because the pitch need not be a multiple of the element size.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((uchar1*)rowStart)[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((uchar1*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy of the whole host buffer into a 2D CUDA array (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied for each of the Height rows, reading the host side with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of the whole host buffer into a 2D CUDA array.
        /// Convenience overload that forwards the array's <c>CUArray</c> handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray targetHandle = array.CUArray;
            this.SynchronCopyToArray2D(targetHandle);
        }

        /// <summary>
        /// Synchronous copy from a 2D CUDA array into this host buffer (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied for each of the Height rows, writing the host side with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from a 2D CUDA array into this host buffer.
        /// Convenience overload that forwards the array's <c>CUArray</c> handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray sourceHandle = array.CUArray;
            this.SynchronCopyFromArray2D(sourceHandle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy host to device (cuMemcpyHtoD_v2). Copies <see cref="SizeInBytes"/> bytes
        /// starting at the pinned host pointer; no pitch conversion takes place.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy host to device (cuMemcpyHtoD_v2). Copies <see cref="SizeInBytes"/> bytes
        /// to the device pointer of the given variable; no pitch conversion takes place.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<uchar1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(destination, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy device to host (cuMemcpyDtoH_v2). Copies <see cref="SizeInBytes"/> bytes
        /// from the device pointer into this host buffer; no pitch conversion takes place.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy device to host (cuMemcpyDtoH_v2). Copies <see cref="SizeInBytes"/> bytes
        /// from the device pointer of the given variable into this host buffer; no pitch conversion takes place.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<uchar1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy host to pitched device memory (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied for each of the Height rows; the host side uses this buffer's pitch,
        /// the device side uses <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed to the driver as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy host to pitched device memory.
        /// Convenience overload that forwards the variable's device pointer and pitch to the pointer-based overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<uchar1> deviceVar)
        {
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this host buffer (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied for each of the Height rows; the device side uses
        /// <paramref name="pitchDevice"/>, the host side uses this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed to the driver as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this host buffer.
        /// Convenience overload that forwards the variable's device pointer and pitch to the pointer-based overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<uchar1> deviceVar)
        {
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy of the whole host buffer into a 2D CUDA array (cuMemcpy2DAsync_v2 on the given stream).
        /// Width * TypeSize bytes are copied for each of the Height rows, reading the host side with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of the whole host buffer into a 2D CUDA array.
        /// Convenience overload that forwards the array's <c>CUArray</c> handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray targetHandle = array.CUArray;
            this.AsyncCopyToArray2D(targetHandle, stream);
        }

        /// <summary>
        /// Asynchronous copy from a 2D CUDA array into this host buffer (cuMemcpy2DAsync_v2 on the given stream).
        /// Width * TypeSize bytes are copied for each of the Height rows, writing the host side with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a 2D CUDA array into this host buffer.
        /// Convenience overload that forwards the array's <c>CUArray</c> handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Source array</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray sourceHandle = array.CUArray;
            this.AsyncCopyFromArray2D(sourceHandle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous linear copy host to device (cuMemcpyHtoDAsync_v2 on the given stream).
        /// Copies <see cref="SizeInBytes"/> bytes; no pitch conversion takes place.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy device to host (cuMemcpyDtoHAsync_v2 on the given stream).
        /// Copies <see cref="SizeInBytes"/> bytes into this host buffer; no pitch conversion takes place.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy host to device (cuMemcpyHtoDAsync_v2 on the given stream).
        /// Copies <see cref="SizeInBytes"/> bytes to the device pointer of the given variable; no pitch conversion takes place.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<uchar1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(destination, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy device to host (cuMemcpyDtoHAsync_v2 on the given stream).
        /// Copies <see cref="SizeInBytes"/> bytes from the device pointer of the given variable into this host buffer;
        /// no pitch conversion takes place.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<uchar1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, source, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy host to pitched device memory (cuMemcpy2DAsync_v2 on the given stream).
        /// Width * TypeSize bytes are copied for each of the Height rows; the host side uses this buffer's pitch,
        /// the device side uses <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed to the driver as dstPitch</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy host to pitched device memory.
        /// Convenience overload that forwards the variable's device pointer and pitch to the pointer-based overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<uchar1> deviceVar, CUstream stream)
        {
            this.AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this host buffer (cuMemcpy2DAsync_v2 on the given stream).
        /// Width * TypeSize bytes are copied for each of the Height rows; the device side uses
        /// <paramref name="pitchDevice"/>, the host side uses this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed to the driver as srcPitch</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this host buffer.
        /// Convenience overload that forwards the variable's device pointer and pitch to the pointer-based overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<uchar1> deviceVar, CUstream stream)
        {
            this.AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space
        /// (cuMemHostGetDevicePointer_v2, called with flags = 0). Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>The allocation flags reported by the driver for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log under the name of the API actually called; the label used to read
            // "cuMemHostGetDevicePointer", which made the debug trace misleading.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_uchar1 bound to this instance</returns>
        IEnumerator<uchar1> IEnumerable<uchar1>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uchar1(this);
        }

        /// <summary>
        /// Creates an untyped enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_uchar1 bound to this instance</returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uchar1(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_uchar1.<para/>
    /// Elements are visited row by row: x runs from 0 to Width - 1 within a row, then y advances to the next row.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_uchar1 : IEnumerator<uchar1>
    {
        // Buffer being enumerated.
        private CudaPageLockedHostMemory2D_uchar1 _memory = null;
        // Column of the current element; -1 means "before the first element".
        private SizeT _currentX = -1;
        // Row of the current element.
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_uchar1(CudaPageLockedHostMemory2D_uchar1 memory)
        {
            _memory = memory;
        }

        // The enumerator owns no resources; the buffer stays owned by its creator.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position, read through the buffer's indexer.
        /// Only meaningful after <see cref="MoveNext"/> has returned true; no position check is made here.
        /// </summary>
        public uchar1 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position, boxed. Same restrictions as the typed <see cref="Current"/>.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at an element, false if the end has been passed</returns>
        public bool MoveNext()
        {
            // A buffer without columns contains no elements. Without this guard the wrap-around
            // below would step through the rows and report one non-existent element per row.
            if ((long)_memory.Width <= 0)
                return false;

            _currentX += 1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: continue with the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: uchar2
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_uchar2: IDisposable, IEnumerable<uchar2>
	{
		// Pointer to the host memory block, filled in by cuMemHostAlloc in the constructor.
		IntPtr _intPtr;
		// The same address as _intPtr, typed for direct element access.
		uchar2* _ptr;
        // Total allocation size: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Row length in elements.
        SizeT _width = 0;
        // Distance between consecutive row starts, in bytes.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Size of one uchar2 in bytes, obtained via Marshal.SizeOf.
        SizeT _typeSize = 0;
        // Result code of the most recent driver call made through this instance.
        CUResult res;
        // Set to true once Dispose(true) has run; guards all public operations.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar2 and allocates pitchInBytes * height bytes of
        /// host memory through cuMemHostAlloc with the given flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes; must be at least width * sizeof(uchar2)</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(uchar2)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc returned a result other than Success</exception>
        public CudaPageLockedHostMemory2D_uchar2(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            this._intPtr = new IntPtr();
            this._width = width;
            this._pitchInBytes = pitchInBytes;
            this._height = height;
            this._typeSize = (SizeT)Marshal.SizeOf(typeof(uchar2));
            this._sizeInBytes = this._pitchInBytes * this._height;

            // One full row of elements has to fit into a single pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(uchar2)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (uchar2*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar2 and allocates the memory on host. Using cuMemHostAlloc without flags
        /// (0 is passed as allocation flags to the four-argument constructor).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uchar2(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {
            // Intentionally empty: everything is done by the four-argument constructor.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar2 whose rows are stored without padding. Using cuMemHostAlloc without flags.<para/>
        /// The pitch is taken to be width * sizeof(uchar2) and 0 is passed as allocation flags to the four-argument constructor.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uchar2(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uchar2)), height, 0)
        {
            // Intentionally empty: everything is done by the four-argument constructor.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar2 whose rows are stored without padding. Using cuMemHostAlloc with the given flags.<para/>
        /// The pitch is taken to be width * sizeof(uchar2).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_uchar2(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uchar2)), height, allocFlags)
        {
            // Intentionally empty: everything is done by the four-argument constructor.
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <c>Dispose(bool)</c>, which does not free the
        /// host memory but writes a "not-disposed" warning to the debug output if Dispose() was never called.
        /// </summary>
        ~CudaPageLockedHostMemory2D_uchar2()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory (via <c>Dispose(true)</c>) and removes this instance
        /// from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core of the dispose pattern.<para/>
        /// Called with <c>true</c> it frees the host memory through cuMemFreeHost (the driver result is only
        /// logged, never thrown) and marks the instance as disposed.<para/>
        /// Called with <c>false</c> (finalizer) nothing is freed; a "not-disposed" warning is written
        /// to the debug output instead.
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Already released: neither a second free nor a warning is wanted.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Raw pointer to the pinned host memory block obtained from cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Distance in bytes between the start of one row and the start of the next.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch in bytes times height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of a single uchar2 element in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element directly in the pinned host memory.<para/>
        /// The row start is located at <c>y * Pitch</c> bytes from the base pointer; <c>x</c> then indexes
        /// elements within that row. Neither the indices nor the disposed state are checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at column x of row y</returns>
        public uchar2 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are addressed in bytes because the pitch need not be a multiple of the element size.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((uchar2*)rowStart)[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((uchar2*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy of the whole host buffer into a 2D CUDA array (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied for each of the Height rows, reading the host side with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of the whole host buffer into a 2D CUDA array.
        /// Convenience overload that forwards the array's <c>CUArray</c> handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray targetHandle = array.CUArray;
            this.SynchronCopyToArray2D(targetHandle);
        }

        /// <summary>
        /// Synchronous copy from a 2D CUDA array into this host buffer (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied for each of the Height rows, writing the host side with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from a 2D CUDA array into this host buffer.
        /// Convenience overload that forwards the array's <c>CUArray</c> handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray sourceHandle = array.CUArray;
            this.SynchronCopyFromArray2D(sourceHandle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy host to device (cuMemcpyHtoD_v2). Copies <see cref="SizeInBytes"/> bytes
        /// (pitch times height, padding included) starting at the pinned host pointer; no pitch conversion takes place.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver returned a result other than Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy from this host buffer to a device variable.<para/>
        /// Issues cuMemcpyHtoD_v2 for <c>SizeInBytes</c> bytes, targeting <c>devicePtr.DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoD_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<uchar2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous linear copy from device memory into this host buffer.<para/>
        /// Issues cuMemcpyDtoH_v2 for <c>SizeInBytes</c> bytes as one contiguous block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoH_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous linear copy from a device variable into this host buffer.<para/>
        /// Issues cuMemcpyDtoH_v2 for <c>SizeInBytes</c> bytes, reading from <c>devicePtr.DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoH_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<uchar2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy from this host buffer to pitched device memory.<para/>
        /// Issues cuMemcpy2D_v2 for <c>_height</c> rows of <c>_width * _typeSize</c> bytes; source rows use this buffer's pitch, destination rows use <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory, passed as <c>dstPitch</c></param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyDesc = new CUDAMemCpy2D
            {
                // source: this pinned host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // destination: pitched device memory
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyDesc);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous 2D copy from this host buffer to a pitched device variable.<para/>
        /// Convenience overload: forwards <c>deviceVar.DevicePointer</c> and <c>deviceVar.Pitch</c> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<uchar2> deviceVar)
        {
            // The disposed check and the actual copy are done by the pointer/pitch overload.
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this host buffer.<para/>
        /// Issues cuMemcpy2D_v2 for <c>_height</c> rows of <c>_width * _typeSize</c> bytes; source rows use <paramref name="pitchDevice"/>, destination rows use this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory, passed as <c>srcPitch</c></param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyDesc = new CUDAMemCpy2D
            {
                // destination: this pinned host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // source: pitched device memory
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyDesc);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous 2D copy from a pitched device variable into this host buffer.<para/>
        /// Convenience overload: forwards <c>deviceVar.DevicePointer</c> and <c>deviceVar.Pitch</c> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<uchar2> deviceVar)
        {
            // The disposed check and the actual copy are done by the pointer/pitch overload.
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy of this host buffer into a 2D CUDA array.<para/>
        /// Issues cuMemcpy2DAsync_v2 on <paramref name="stream"/> for <c>_height</c> rows of <c>_width * _typeSize</c> bytes, reading with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyDesc = new CUDAMemCpy2D
            {
                // source: this pinned host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // destination: the CUDA array
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyDesc, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy of this host buffer into a 2D CUDA array.<para/>
        /// Convenience overload: passes <c>array.CUArray</c> and the stream on to the overload taking a raw array handle.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            // The disposed check and the actual copy are done by the handle-based overload.
            this.AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous copy of a 2D CUDA array into this host buffer.<para/>
        /// Issues cuMemcpy2DAsync_v2 on <paramref name="stream"/> for <c>_height</c> rows of <c>_width * _typeSize</c> bytes, writing with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyDesc = new CUDAMemCpy2D
            {
                // destination: this pinned host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // source: the CUDA array
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyDesc, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy of a 2D CUDA array into this host buffer.<para/>
        /// Convenience overload: passes <c>array.CUArray</c> and the stream on to the overload taking a raw array handle.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            // The disposed check and the actual copy are done by the handle-based overload.
            this.AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous linear copy from this host buffer to device memory.<para/>
        /// Issues cuMemcpyHtoDAsync_v2 on <paramref name="stream"/> for <c>SizeInBytes</c> bytes as one contiguous block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoDAsync_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, this._intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous linear copy from device memory into this host buffer.<para/>
        /// Issues cuMemcpyDtoHAsync_v2 on <paramref name="stream"/> for <c>SizeInBytes</c> bytes as one contiguous block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoHAsync_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous linear copy from this host buffer to a device variable.<para/>
        /// Issues cuMemcpyHtoDAsync_v2 on <paramref name="stream"/> for <c>SizeInBytes</c> bytes, targeting <c>devicePtr.DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoDAsync_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<uchar2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous linear copy from a device variable into this host buffer.<para/>
        /// Issues cuMemcpyDtoHAsync_v2 on <paramref name="stream"/> for <c>SizeInBytes</c> bytes, reading from <c>devicePtr.DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoHAsync_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<uchar2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy from this host buffer to pitched device memory.<para/>
        /// Issues cuMemcpy2DAsync_v2 on <paramref name="stream"/> for <c>_height</c> rows of <c>_width * _typeSize</c> bytes; source rows use this buffer's pitch, destination rows use <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory, passed as <c>dstPitch</c></param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyDesc = new CUDAMemCpy2D
            {
                // source: this pinned host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // destination: pitched device memory
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyDesc, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous 2D copy from this host buffer to a pitched device variable.<para/>
        /// Convenience overload: forwards <c>deviceVar.DevicePointer</c>, <c>deviceVar.Pitch</c> and the stream to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<uchar2> deviceVar, CUstream stream)
        {
            // The disposed check and the actual copy are done by the pointer/pitch overload.
            this.AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this host buffer.<para/>
        /// Issues cuMemcpy2DAsync_v2 on <paramref name="stream"/> for <c>_height</c> rows of <c>_width * _typeSize</c> bytes; source rows use <paramref name="pitchDevice"/>, destination rows use this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory, passed as <c>srcPitch</c></param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyDesc = new CUDAMemCpy2D
            {
                // destination: this pinned host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // source: pitched device memory
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyDesc, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable into this host buffer.<para/>
        /// Convenience overload: forwards <c>deviceVar.DevicePointer</c>, <c>deviceVar.Pitch</c> and the stream to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<uchar2> deviceVar, CUstream stream)
        {
            // The disposed check and the actual copy are done by the pointer/pitch overload.
            this.AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/><para/>
        /// Queries the pointer through cuMemHostGetDevicePointer_v2, passing 0 as flags argument.
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemHostGetDevicePointer_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPtr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPtr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res == CUResult.Success)
            {
                return mappedPtr;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer.<para/>
        /// Queries the flags through cuMemHostGetFlags for this buffer's host pointer.
        /// </summary>
        /// <returns>Flags reported by the driver for this allocation</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemHostGetFlags returned a result other than <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the API that was actually called (the label used to be a copy of the one in GetDevicePointer).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over this buffer's elements.
        /// </summary>
        /// <returns>A new <see cref="CudaPageLockedHostMemory2DEnumerator_uchar2"/> bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        IEnumerator<uchar2> IEnumerable<uchar2>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uchar2(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over this buffer's elements.
        /// </summary>
        /// <returns>A new <see cref="CudaPageLockedHostMemory2DEnumerator_uchar2"/> bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uchar2(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_uchar2.<para/>
    /// Each step advances the x position first and moves on to the next row once x reaches the memory's Width.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_uchar2 : IEnumerator<uchar2>
    {
        private CudaPageLockedHostMemory2D_uchar2 _memory = null;
        // Initial position is x = -1, y = 0: MoveNext() increments x before any element is read.
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Host memory to enumerate; the reference is stored, no data is copied</param>
        public CudaPageLockedHostMemory2DEnumerator_uchar2(CudaPageLockedHostMemory2D_uchar2 memory)
        {
            _memory = memory;
        }

        // Nothing to release: the enumerator only holds a reference to the memory object.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Puts the enumerator back to its initial position (x = -1, y = 0).
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position, read through the memory's indexer.<para/>
        /// The position is not validated here; only read this after MoveNext() returned true.
        /// </summary>
        public uchar2 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position as boxed object, read through the memory's indexer.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element: x is incremented, and when it reaches Width it is reset to 0 and y is incremented.
        /// </summary>
        /// <returns>true while the current row index is below Height; false once all rows are consumed or when Width is zero</returns>
        public bool MoveNext()
        {
            // A zero-width buffer has no elements. Without this guard x would wrap on every call
            // and Height - 1 non-existing elements would be reported.
            if ((long)_memory.Width <= 0)
                return false;

            _currentX+=1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: continue at the start of the next row.
                _currentX = 0;
                _currentY+=1;
            }

            if ((long)_currentY >= (long)_memory.Height)
                return false;
            else
                return true;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: uchar3
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_uchar3: IDisposable, IEnumerable<uchar3>
	{
		IntPtr _intPtr;
		uchar3* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates <c>pitchInBytes * height</c> bytes of host memory using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes; must be at least <c>width * sizeof(uchar3)</c></param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException"><paramref name="pitchInBytes"/> is smaller than <c>width * sizeof(uchar3)</c></exception>
        /// <exception cref="CudaException">cuMemHostAlloc returned a result other than <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_uchar3(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            this._intPtr = new IntPtr();
            this._typeSize = (SizeT)Marshal.SizeOf(typeof(uchar3));
            this._width = width;
            this._height = height;
            this._pitchInBytes = pitchInBytes;
            this._sizeInBytes = this._pitchInBytes * this._height;

            // A row must fit into the pitch; checked before anything is allocated.
            if (this._typeSize * width > this._pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(uchar3)", "pitchInBytes");
            }

            this.res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref this._intPtr, this._sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            // Typed view onto the same address, used by the indexer.
            this._ptr = (uchar3*) this._intPtr;
            if (this.res != CUResult.Success)
            {
                throw new CudaException(this.res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Delegates to the four-argument constructor, passing 0 as allocation flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uchar3(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is set to width * sizeof(uchar3), i.e. rows are stored without padding.<para/>
        /// Delegates to the four-argument constructor, passing 0 as allocation flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uchar3(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uchar3)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar3 and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is set to width * sizeof(uchar3), i.e. rows are stored without padding.<para/>
        /// Delegates to the four-argument constructor.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_uchar3(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uchar3)), height, allocFlags)
        {

        }

        /// <summary>
        /// Finalizer. Calls <c>Dispose(false)</c>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_uchar3()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory by calling <c>Dispose(true)</c> and removes this instance from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable.<para/>
        /// When disposing explicitly, frees the host memory with cuMemFreeHost and marks this instance as disposed; the result of cuMemFreeHost is only traced, not thrown.<para/>
        /// When called from the finalizer on a not yet disposed instance, only a debug warning is written and the memory is not freed here.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                // Already released: nothing to free and nothing to warn about.
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to the pinned host memory, as filled in by cuMemHostAlloc in the constructor.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width in elements, as passed to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Height in elements (number of rows), as passed to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Pitch in bytes: the distance between the starts of two consecutive rows, as used by the indexer.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total allocation size in bytes (pitch in bytes multiplied by height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one element in bytes, determined with Marshal.SizeOf(typeof(uchar3)) in the constructor.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element.<para/>
        /// The row start is computed as base address + pitch in bytes * y, then element x of that row is read or written.<para/>
        /// Neither the indices nor the disposed state are checked here.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at column <paramref name="x"/> of row <paramref name="y"/></returns>
        public uchar3 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are addressed in bytes because the pitch may include padding.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                uchar3* row = (uchar3*)rowStart;
                return row[x];
            }
            set
            {
                // Rows are addressed in bytes because the pitch may include padding.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                uchar3* row = (uchar3*)rowStart;
                row[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy of this host buffer into a 2D CUDA array.<para/>
        /// Issues cuMemcpy2D_v2 for <c>Height</c> rows of <c>Width * TypeSize</c> bytes, reading with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyDesc = new CUDAMemCpy2D
            {
                // source: this pinned host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // destination: the CUDA array
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyDesc);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy of this host buffer into a 2D CUDA array.<para/>
        /// Convenience overload: passes <c>array.CUArray</c> on to the overload taking a raw array handle.
        /// </summary>
        /// <param name="array">Destination array wrapper; its <c>CUArray</c> handle is used as copy target</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            // The disposed check and the actual copy are done by the handle-based overload.
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer.<para/>
        /// Issues cuMemcpy2D_v2 for <c>Height</c> rows of <c>Width * TypeSize</c> bytes, writing with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyDesc = new CUDAMemCpy2D
            {
                // destination: this pinned host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // source: the CUDA array
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyDesc);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer.<para/>
        /// Convenience overload: passes <c>array.CUArray</c> on to the overload taking a raw array handle.
        /// </summary>
        /// <param name="array">Source array wrapper; its <c>CUArray</c> handle is used as copy source</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            // The disposed check and the actual copy are done by the handle-based overload.
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy from this host buffer to device memory.<para/>
        /// Issues cuMemcpyHtoD_v2 for <c>SizeInBytes</c> bytes (pitch * height) as one contiguous block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoD_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous linear copy from this host buffer to a device variable.<para/>
        /// Issues cuMemcpyHtoD_v2 for <c>SizeInBytes</c> bytes (pitch * height), targeting <c>devicePtr.DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoD_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<uchar3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous linear copy from device memory into this host buffer.<para/>
        /// Issues cuMemcpyDtoH_v2 for <c>SizeInBytes</c> bytes (pitch * height) as one contiguous block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoH_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous linear copy from a device variable into this host buffer.<para/>
        /// Issues cuMemcpyDtoH_v2 for <c>SizeInBytes</c> bytes (pitch * height), reading from <c>devicePtr.DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoH_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<uchar3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy from this host buffer to pitched device memory.<para/>
        /// Issues cuMemcpy2D_v2 for <c>Height</c> rows of <c>Width * TypeSize</c> bytes; source rows use this buffer's pitch, destination rows use <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory, passed as <c>dstPitch</c></param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyDesc = new CUDAMemCpy2D
            {
                // source: this pinned host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // destination: pitched device memory
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyDesc);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous 2D copy from this host buffer to a pitched device variable.<para/>
        /// Convenience overload: forwards <c>deviceVar.DevicePointer</c> and <c>deviceVar.Pitch</c> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<uchar3> deviceVar)
        {
            // The disposed check and the actual copy are done by the pointer/pitch overload.
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this host buffer.<para/>
        /// Issues cuMemcpy2D_v2 for <c>Height</c> rows of <c>Width * TypeSize</c> bytes; source rows use <paramref name="pitchDevice"/>, destination rows use this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory, passed as <c>srcPitch</c></param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned a result other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyDesc = new CUDAMemCpy2D
            {
                // destination: this pinned host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // source: pitched device memory
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyDesc);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous 2D copy from a pitched device variable into this host buffer.<para/>
        /// Convenience overload: forwards <c>deviceVar.DevicePointer</c> and <c>deviceVar.Pitch</c> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<uchar3> deviceVar)
        {
            // The disposed check and the actual copy are done by the pointer/pitch overload.
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Calls cuMemcpy2DAsync_v2 with the given stream to copy this host buffer into a 2D CUDA array.
        /// The copied region is Width * TypeSize bytes per row and Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this host buffer into a 2D CUDA array by passing the array's CUArray handle on to
        /// the handle-based overload of AsyncCopyToArray2D.
        /// </summary>
        /// <param name="array">Destination array wrapper.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray targetArray = array.CUArray;
            AsyncCopyToArray2D(targetArray, stream);
        }

        /// <summary>
        /// Calls cuMemcpy2DAsync_v2 with the given stream to copy a 2D CUDA array into this host buffer.
        /// The copied region is Width * TypeSize bytes per row and Height rows.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a 2D CUDA array into this host buffer by passing the array's CUArray handle on to
        /// the handle-based overload of AsyncCopyFromArray2D.
        /// </summary>
        /// <param name="array">Source array wrapper.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray sourceArray = array.CUArray;
            AsyncCopyFromArray2D(sourceArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Calls cuMemcpyHtoDAsync_v2 with the given stream to copy SizeInBytes bytes from this host
        /// buffer to the given device pointer.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Calls cuMemcpyDtoHAsync_v2 with the given stream to copy SizeInBytes bytes from the given
        /// device pointer into this host buffer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Calls cuMemcpyHtoDAsync_v2 with the given stream to copy SizeInBytes bytes from this host
        /// buffer to the device pointer of the given device variable.
        /// </summary>
        /// <param name="devicePtr">Device variable that receives the data.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<uchar3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(target, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Calls cuMemcpyDtoHAsync_v2 with the given stream to copy SizeInBytes bytes from the device
        /// pointer of the given device variable into this host buffer.
        /// </summary>
        /// <param name="devicePtr">Device variable to read from.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<uchar3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, source, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Calls cuMemcpy2DAsync_v2 with the given stream to copy this host buffer into pitched device memory.
        /// The copied region is Width * TypeSize bytes per row and Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer.</param>
        /// <param name="pitchDevice">Pitch of the destination device memory, passed as the destination pitch of the copy.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this host buffer to a pitched device variable by passing its device pointer and
        /// pitch on to the pointer/pitch/stream overload of AsyncCopyToDevice.
        /// </summary>
        /// <param name="deviceVar">Pitched device variable that receives the data.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<uchar3> deviceVar, CUstream stream)
        {
            CUdeviceptr targetPointer = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(targetPointer, targetPitch, stream);
        }

        /// <summary>
        /// Calls cuMemcpy2DAsync_v2 with the given stream to copy pitched device memory into this host buffer.
        /// The copied region is Width * TypeSize bytes per row and Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer.</param>
        /// <param name="pitchDevice">Pitch of the source device memory, passed as the source pitch of the copy.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a pitched device variable into this host buffer by passing its device pointer and
        /// pitch on to the pointer/pitch/stream overload of AsyncCopyFromDevice.
        /// </summary>
        /// <param name="deviceVar">Pitched device variable to read from.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<uchar3> deviceVar, CUstream stream)
        {
            CUdeviceptr sourcePointer = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(sourcePointer, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries, via cuMemHostGetDevicePointer_v2, the device pointer under which this pinned host memory
        /// is mapped into device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>The device pointer reported by the driver.</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Queries, via cuMemHostGetFlags, the flags that were specified when allocating the pinned host buffer.
        /// </summary>
        /// <returns>The allocation flags reported by the driver.</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // The trace must name the function actually called; it previously reported
            // "cuMemHostGetDevicePointer", which made debug logs misleading.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        IEnumerator<uchar3> IEnumerable<uchar3>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uchar3(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uchar3(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_uchar3.<para/>
    /// Elements are visited row by row: the x index advances first and wraps to the next
    /// row once it reaches the width of the buffer.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_uchar3 : IEnumerator<uchar3>
    {
        private CudaPageLockedHostMemory2D_uchar3 _memory = null;
        // Position of the current element; x starts at -1 so that the first MoveNext() lands on (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over the given buffer.
        /// </summary>
        /// <param name="memory">Buffer whose elements are enumerated.</param>
        public CudaPageLockedHostMemory2DEnumerator_uchar3(CudaPageLockedHostMemory2D_uchar3 memory)
        {
            _memory = memory;
        }

        // The enumerator holds no resources of its own, so there is nothing to release.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position, read through the buffer's indexer.
        /// </summary>
        public uchar3 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position, boxed as <see cref="object"/>.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns><c>true</c> if the new position is inside the buffer; <c>false</c> once all rows have been passed or the buffer has no columns.</returns>
        public bool MoveNext()
        {
            // A buffer with zero width has no elements at all. Without this guard the wrap-around
            // below would report element (0, y) of an empty row as valid for every y < Height.
            if ((long)_memory.Width == 0)
                return false;

            _currentX += 1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of the row reached: continue at the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: uchar4
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_uchar4: IDisposable, IEnumerable<uchar4>
	{
		IntPtr _intPtr;
		uchar4* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar4 and allocates pitchInBytes * height bytes of
        /// host memory with cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc.</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(uchar4).</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/>.</exception>
        public CudaPageLockedHostMemory2D_uchar4(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = IntPtr.Zero;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(uchar4));
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _sizeInBytes = pitchInBytes * height;

            // One row of elements must fit into one pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(uchar4)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (uchar4*)_intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar4 with an explicit pitch, delegating to the
        /// four-argument constructor with allocation flags set to 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uchar4(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar4 without padding, delegating to the
        /// four-argument constructor with allocation flags set to 0.<para/>
        /// Pitch is set to width * sizeof(uchar4).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uchar4(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uchar4)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uchar4 without padding, delegating to the
        /// four-argument constructor with the given allocation flags.<para/>
        /// Pitch is set to width * sizeof(uchar4).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to the four-argument constructor.</param>
        public CudaPageLockedHostMemory2D_uchar4(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uchar4)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer; runs the non-disposing path of <c>Dispose(bool)</c>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_uchar4()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Runs the disposing path of <c>Dispose(bool)</c> and then suppresses finalization of this object.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core of the dispose logic. Does nothing if the object is already disposed.
        /// </summary>
        /// <param name="fDisposing">
        /// <c>true</c>: the host memory is released with cuMemFreeHost and the object is marked disposed.<para/>
        /// <c>false</c>: only a debug warning is written; the memory is not freed on this path.
        /// </param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// The host pointer that was filled in by cuMemHostAlloc in the constructor.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row, as passed to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows, as passed to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Byte distance between the starts of two consecutive rows.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total allocation size in bytes (pitch in bytes multiplied by height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one uchar4 element in bytes, as reported by Marshal.SizeOf in the constructor.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element. The row start is located by adding y times the pitch
        /// (in bytes) to the base pointer; x then indexes elements within that row.<para/>
        /// Neither the indices nor the disposed state are checked here.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at column x of row y.</returns>
        public uchar4 this[SizeT x, SizeT y]
        {
            get
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((uchar4*)rowStart)[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((uchar4*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Copies this host buffer into a 2D CUDA array via cuMemcpy2D_v2.
        /// The copied region is Width * TypeSize bytes per row and Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this host buffer into a 2D CUDA array by passing the array's CUArray handle on to
        /// the handle-based overload of SynchronCopyToArray2D.
        /// </summary>
        /// <param name="array">Destination array wrapper.</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray targetArray = array.CUArray;
            SynchronCopyToArray2D(targetArray);
        }

        /// <summary>
        /// Copies a 2D CUDA array into this host buffer via cuMemcpy2D_v2.
        /// The copied region is Width * TypeSize bytes per row and Height rows.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a 2D CUDA array into this host buffer by passing the array's CUArray handle on to
        /// the handle-based overload of SynchronCopyFromArray2D.
        /// </summary>
        /// <param name="array">Source array wrapper.</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray sourceArray = array.CUArray;
            SynchronCopyFromArray2D(sourceArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Copies the whole allocation (SizeInBytes bytes, i.e. pitch * height) from this host buffer
        /// to the given device pointer via cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Copies the whole allocation (SizeInBytes bytes, i.e. pitch * height) from this host buffer
        /// to the device pointer of the given device variable via cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Device variable that receives the data.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<uchar4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(target, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Copies SizeInBytes bytes (pitch * height) from the given device pointer into this host
        /// buffer via cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Copies SizeInBytes bytes (pitch * height) from the device pointer of the given device
        /// variable into this host buffer via cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Device variable to read from.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<uchar4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Copies this host buffer into pitched device memory via cuMemcpy2D_v2.
        /// The copied region is Width * TypeSize bytes per row and Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer.</param>
        /// <param name="pitchDevice">Pitch of the destination device memory, passed as the destination pitch of the copy.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this host buffer to a pitched device variable by passing its device pointer and
        /// pitch on to the pointer/pitch overload of SynchronCopyToDevice.
        /// </summary>
        /// <param name="deviceVar">Pitched device variable that receives the data.</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<uchar4> deviceVar)
        {
            CUdeviceptr targetPointer = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(targetPointer, targetPitch);
        }

        /// <summary>
        /// Copies pitched device memory into this host buffer via cuMemcpy2D_v2.
        /// The copied region is Width * TypeSize bytes per row and Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer.</param>
        /// <param name="pitchDevice">Pitch of the source device memory, passed as the source pitch of the copy.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a pitched device variable into this host buffer by passing its device pointer and
        /// pitch on to the pointer/pitch overload of SynchronCopyFromDevice.
        /// </summary>
        /// <param name="deviceVar">Pitched device variable to read from.</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<uchar4> deviceVar)
        {
            CUdeviceptr sourcePointer = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(sourcePointer, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Calls cuMemcpy2DAsync_v2 with the given stream to copy this host buffer into a 2D CUDA array.
        /// The copied region is Width * TypeSize bytes per row and Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this host buffer into a 2D CUDA array by passing the array's CUArray handle on to
        /// the handle-based overload of AsyncCopyToArray2D.
        /// </summary>
        /// <param name="array">Destination array wrapper.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray targetArray = array.CUArray;
            AsyncCopyToArray2D(targetArray, stream);
        }

        /// <summary>
        /// Calls cuMemcpy2DAsync_v2 with the given stream to copy a 2D CUDA array into this host buffer.
        /// The copied region is Width * TypeSize bytes per row and Height rows.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a 2D CUDA array into this host buffer by passing the array's CUArray handle on to
        /// the handle-based overload of AsyncCopyFromArray2D.
        /// </summary>
        /// <param name="array">Source array wrapper.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray sourceArray = array.CUArray;
            AsyncCopyFromArray2D(sourceArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Calls cuMemcpyHtoDAsync_v2 with the given stream to copy SizeInBytes bytes (pitch * height)
        /// from this host buffer to the given device pointer.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Calls cuMemcpyDtoHAsync_v2 with the given stream to copy SizeInBytes bytes (pitch * height)
        /// from the given device pointer into this host buffer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Calls cuMemcpyHtoDAsync_v2 with the given stream to copy SizeInBytes bytes (pitch * height)
        /// from this host buffer to the device pointer of the given device variable.
        /// </summary>
        /// <param name="devicePtr">Device variable that receives the data.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<uchar4> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(target, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Calls cuMemcpyDtoHAsync_v2 with the given stream to copy SizeInBytes bytes (pitch * height)
        /// from the device pointer of the given device variable into this host buffer.
        /// </summary>
        /// <param name="devicePtr">Device variable to read from.</param>
        /// <param name="stream">Stream handed to the asynchronous copy call.</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<uchar4> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, source, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to pitched device memory (cuMemcpy2DAsync).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer with its own pitch.
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // Destination: device memory with the caller supplied pitch.
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                // Extent: _width * _typeSize bytes per row, _height rows.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to a pitched device variable.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer based overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<uchar4> deviceVar, CUstream stream)
        {
            this.AsyncCopyToDevice(
                deviceVar.DevicePointer,
                deviceVar.Pitch,
                stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this pinned host buffer (cuMemcpy2DAsync).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: device memory with the caller supplied pitch.
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                // Destination: this pinned host buffer with its own pitch.
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Extent: _width * _typeSize bytes per row, _height rows.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer based overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<uchar4> deviceVar, CUstream stream)
        {
            this.AsyncCopyFromDevice(
                deviceVar.DevicePointer,
                deviceVar.Pitch,
                stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer obtained from cuMemHostGetDevicePointer for the pinned host pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Filled in by the driver call below; the last argument (flags) is passed as 0.
            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>Allocation flags reported by the driver for the pinned host pointer</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <c>CUResult.Success</c></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the API that was actually called; this line used to report "cuMemHostGetDevicePointer".
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over this memory.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_uchar4 bound to this instance</returns>
        IEnumerator<uchar4> IEnumerable<uchar4>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uchar4(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over this memory.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_uchar4 bound to this instance</returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uchar4(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_uchar4.<para/>
    /// Elements are visited row by row: x runs from 0 to Width - 1 before y is incremented.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_uchar4 : IEnumerator<uchar4>
    {
        private CudaPageLockedHostMemory2D_uchar4 _memory = null;
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // Explicit cursor state is tracked instead of a -1 sentinel, so the coordinates
        // never have to hold a negative value in a size type.
        private bool _started = false;   // MoveNext was called at least once since construction / Reset
        private bool _finished = false;  // the cursor has moved past the last element

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Memory to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_uchar4(CudaPageLockedHostMemory2D_uchar4 memory)
        {
            _memory = memory;
        }

        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = 0;
            _currentY = 0;
            _started = false;
            _finished = false;
        }

        /// <summary>
        /// Element at the current position.
        /// </summary>
        /// <exception cref="InvalidOperationException">The enumerator is positioned before the first or after the last element</exception>
        public uchar4 Current
        {
            get
            {
                EnsurePositioned();
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Element at the current position (boxed).
        /// </summary>
        /// <exception cref="InvalidOperationException">The enumerator is positioned before the first or after the last element</exception>
        object IEnumerator.Current
        {
            get
            {
                EnsurePositioned();
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now designates an element; false once the end has been passed</returns>
        public bool MoveNext()
        {
            if (_finished)
                return false;

            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // An empty extent has nothing to visit. Without this guard a zero-width
            // memory would report elements that do not exist.
            if (width <= 0 || height <= 0)
            {
                _started = true;
                _finished = true;
                return false;
            }

            if (!_started)
            {
                // First call: land on element [0, 0].
                _started = true;
                _currentX = 0;
                _currentY = 0;
            }
            else
            {
                _currentX += 1;
                if ((long)_currentX >= width)
                {
                    // End of row reached: wrap to the start of the next row.
                    _currentX = 0;
                    _currentY += 1;
                }
            }

            if ((long)_currentY >= height)
            {
                _finished = true;
                return false;
            }

            return true;
        }

        // Guards Current: the indexer works on raw pointers, so reading at a position
        // outside the enumerated range must be rejected here.
        private void EnsurePositioned()
        {
            if (!_started || _finished)
                throw new InvalidOperationException("Enumerator is positioned before the first element or after the last element.");
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy.<para/>
	/// Type: sbyte
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_sbyte: IDisposable, IEnumerable<sbyte>
	{
		IntPtr _intPtr;
		sbyte* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_sbyte(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(sbyte));
            _sizeInBytes = _pitchInBytes * _height;

            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(sbyte)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
			_ptr = (sbyte*) _intPtr;
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_sbyte(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(sbyte). Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_sbyte(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(sbyte)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_sbyte and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(sbyte). Using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_sbyte(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(sbyte)), height, allocFlags)
        {

        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_sbyte()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing"></param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns></returns>
        public sbyte this[SizeT x, SizeT y]
        {
            get
            {
				sbyte* line = (sbyte*)(((byte*)_ptr) + _pitchInBytes * y);
				return line[x];
            }
            set
            {
				sbyte* line = (sbyte*)(((byte*)_ptr) + _pitchInBytes * y);
				line[x] = value;                
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CudaDeviceVariable<sbyte> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CudaDeviceVariable<sbyte> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<sbyte> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<sbyte> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaDeviceVariable<sbyte> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<sbyte> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<sbyte> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<sbyte> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer,
        /// as reported by cuMemHostGetFlags.
        /// </summary>
        /// <returns>Flags reported by the driver for this allocation</returns>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemHostGetFlags did not return CUResult.Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log the API that was actually called; the label used to read "cuMemHostGetDevicePointer",
            // which made debug traces point at the wrong driver function.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumerator over all elements of this buffer; refuses to hand one out once disposed.
        IEnumerator<sbyte> IEnumerable<sbyte>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_sbyte(this);
        }

        // Non-generic enumerator; backed by the same enumerator type as the generic version.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_sbyte(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_sbyte.<para/>
    /// Visits the elements row by row: x advances up to Width - 1, then wraps to 0 while y advances by one.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_sbyte : IEnumerator<sbyte>
    {
        private CudaPageLockedHostMemory2D_sbyte _memory = null;
        // Current position in elements; x starts at -1 so that the first MoveNext lands on (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_sbyte(CudaPageLockedHostMemory2D_sbyte memory)
        {
            _memory = memory;
        }

        // Nothing to release: the enumerator only holds a reference to the buffer.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position, read through the buffer's indexer.
        /// The position is not validated here.
        /// </summary>
        public sbyte Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position, boxed as object.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the new position lies inside the buffer, false if there are no more elements</returns>
        public bool MoveNext()
        {
            // A buffer with no columns or no rows has no elements. Without this guard a Width of 0
            // triggered the row wrap below on every call, so MoveNext returned true once per
            // remaining row even though there was nothing valid to read.
            if ((long)_memory.Width <= 0 || (long)_memory.Height <= 0)
                return false;

            _currentX += 1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: continue at the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: char1
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_char1: IDisposable, IEnumerable<char1>
	{
		// Address filled in by cuMemHostAlloc in the constructor; passed to cuMemFreeHost in Dispose(true).
		IntPtr _intPtr;
		// Same address as _intPtr, typed as char1* and used by the indexer.
		char1* _ptr;
        // Allocation size in bytes: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Row width in elements.
        SizeT _width = 0;
        // Distance between the starts of two rows in bytes; the constructor rejects values below _typeSize * width.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(char1)), set in the constructor.
        SizeT _typeSize = 0;
        // Result code of the most recent driver API call made through this object.
        CUResult res;
        // Set to true once Dispose(true) has called cuMemFreeHost.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char1 and allocates pitchInBytes * height bytes on host using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(char1)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return CUResult.Success</exception>
        public CudaPageLockedHostMemory2D_char1(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            this._intPtr = IntPtr.Zero;
            this._width = width;
            this._pitchInBytes = pitchInBytes;
            this._height = height;
            this._typeSize = (SizeT)Marshal.SizeOf(typeof(char1));
            this._sizeInBytes = this._pitchInBytes * this._height;

            // A row must be able to hold all of its elements.
            if (this._typeSize * width > this._pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(char1)", "pitchInBytes");
            }

            this.res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref this._intPtr, this._sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", this.res));
            this._ptr = (char1*)this._intPtr;

            if (this.res != CUResult.Success)
            {
                throw new CudaException(this.res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char1 and allocates the memory on host.
        /// Delegates to the four-argument constructor with allocation flags set to 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_char1(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char1 and allocates the memory on host with allocation flags set to 0.<para/>
        /// Pitch is set to width * Marshal.SizeOf(typeof(char1)).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_char1(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(char1)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char1 and allocates the memory on host using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is set to width * Marshal.SizeOf(typeof(char1)).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_char1(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(char1)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Calls Dispose(false), which does not free the memory and only writes a
        /// debug warning when the object was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_char1()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory through Dispose(true) and removes this object from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable. Does nothing if the object is already disposed.
        /// </summary>
        /// <param name="fDisposing">
        /// true: free the memory with cuMemFreeHost and mark the object as disposed.<para/>
        /// false (finalizer): the memory is not freed, only a debug warning is written.
        /// </param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // The result is logged but not checked, so Dispose itself never throws a CudaException.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory (the address obtained from cuMemHostAlloc).
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width in elements, as passed to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Height in elements, as passed to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Pitch in bytes (distance between the starts of two rows).
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Size in bytes of the whole allocation (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Type size in bytes (Marshal.SizeOf of char1).
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element. The row start is located at byte offset pitch * y from the
        /// beginning of the buffer; x then selects the element inside that row.<para/>
        /// Neither the indices nor the disposed state are checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>Element at position (x, y)</returns>
        public char1 this[SizeT x, SizeT y]
        {
            get
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                char1* row = (char1*)rowStart;
                return row[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                char1* row = (char1*)rowStart;
                row[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy of this host buffer to a 2D array using cuMemcpy2D_v2.
        /// Copies Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 did not return CUResult.Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: this pinned host buffer.
            copyParams.srcMemoryType = CUMemoryType.Host;
            copyParams.srcHost = _intPtr;
            copyParams.srcPitch = _pitchInBytes;

            // Destination: the CUDA array.
            copyParams.dstMemoryType = CUMemoryType.Array;
            copyParams.dstArray = deviceArray;

            // Extent of the copy.
            copyParams.WidthInBytes = _width * _typeSize;
            copyParams.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of this host buffer to a 2D array.
        /// Forwards the CUArray handle of <paramref name="array"/> to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray targetArray = array.CUArray;
            this.SynchronCopyToArray2D(targetArray);
        }

        /// <summary>
        /// Synchronous copy of a 2D array into this host buffer using cuMemcpy2D_v2.
        /// Copies Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 did not return CUResult.Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: the CUDA array.
            copyParams.srcMemoryType = CUMemoryType.Array;
            copyParams.srcArray = deviceArray;

            // Destination: this pinned host buffer.
            copyParams.dstMemoryType = CUMemoryType.Host;
            copyParams.dstHost = _intPtr;
            copyParams.dstPitch = _pitchInBytes;

            // Extent of the copy.
            copyParams.WidthInBytes = _width * _typeSize;
            copyParams.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of a 2D array into this host buffer.
        /// Forwards the CUArray handle of <paramref name="array"/> to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray sourceArray = array.CUArray;
            this.SynchronCopyFromArray2D(sourceArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous copy host to device using cuMemcpyHtoD_v2.
        /// Transfers SizeInBytes bytes (pitch * height) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoD_v2 did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy host to device using cuMemcpyHtoD_v2.
        /// Transfers SizeInBytes bytes (pitch * height) as one linear block to the variable's device pointer.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoD_v2 did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<char1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(target, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy device to host using cuMemcpyDtoH_v2.
        /// Transfers SizeInBytes bytes (pitch * height) as one linear block into this buffer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoH_v2 did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy device to host using cuMemcpyDtoH_v2.
        /// Transfers SizeInBytes bytes (pitch * height) as one linear block from the variable's device pointer.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoH_v2 did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<char1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous copy host to pitched device memory using cuMemcpy2D_v2.
        /// Copies Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination in bytes</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: this pinned host buffer.
            copyParams.srcMemoryType = CUMemoryType.Host;
            copyParams.srcHost = _intPtr;
            copyParams.srcPitch = _pitchInBytes;

            // Destination: pitched device memory.
            copyParams.dstMemoryType = CUMemoryType.Device;
            copyParams.dstDevice = devicePtr;
            copyParams.dstPitch = pitchDevice;

            // Extent of the copy.
            copyParams.WidthInBytes = _width * _typeSize;
            copyParams.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy host to pitched device memory.
        /// Forwards the device pointer and pitch of <paramref name="deviceVar"/> to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<char1> deviceVar)
        {
            CUdeviceptr targetPtr = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            this.SynchronCopyToDevice(targetPtr, targetPitch);
        }

        /// <summary>
        /// Synchronous copy from pitched device memory to host using cuMemcpy2D_v2.
        /// Copies Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source in bytes</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 did not return CUResult.Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: pitched device memory.
            copyParams.srcMemoryType = CUMemoryType.Device;
            copyParams.srcDevice = devicePtr;
            copyParams.srcPitch = pitchDevice;

            // Destination: this pinned host buffer.
            copyParams.dstMemoryType = CUMemoryType.Host;
            copyParams.dstHost = _intPtr;
            copyParams.dstPitch = _pitchInBytes;

            // Extent of the copy.
            copyParams.WidthInBytes = _width * _typeSize;
            copyParams.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from pitched device memory to host.
        /// Forwards the device pointer and pitch of <paramref name="deviceVar"/> to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<char1> deviceVar)
        {
            CUdeviceptr sourcePtr = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            this.SynchronCopyFromDevice(sourcePtr, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy of this host buffer to a 2D array using cuMemcpy2DAsync_v2.
        /// Copies Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 did not return CUResult.Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: this pinned host buffer.
            copyParams.srcMemoryType = CUMemoryType.Host;
            copyParams.srcHost = _intPtr;
            copyParams.srcPitch = _pitchInBytes;

            // Destination: the CUDA array.
            copyParams.dstMemoryType = CUMemoryType.Array;
            copyParams.dstArray = deviceArray;

            // Extent of the copy.
            copyParams.WidthInBytes = _width * _typeSize;
            copyParams.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of this host buffer to a 2D array.
        /// Forwards the CUArray handle of <paramref name="array"/> to the (CUarray, CUstream) overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream passed on to the copy call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray targetArray = array.CUArray;
            this.AsyncCopyToArray2D(targetArray, stream);
        }

        /// <summary>
        /// Asynchronous copy of a 2D array into this host buffer using cuMemcpy2DAsync_v2.
        /// Copies Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 did not return CUResult.Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: the CUDA array.
            copyParams.srcMemoryType = CUMemoryType.Array;
            copyParams.srcArray = deviceArray;

            // Destination: this pinned host buffer.
            copyParams.dstMemoryType = CUMemoryType.Host;
            copyParams.dstHost = _intPtr;
            copyParams.dstPitch = _pitchInBytes;

            // Extent of the copy.
            copyParams.WidthInBytes = _width * _typeSize;
            copyParams.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of a 2D array into this host buffer.
        /// Forwards the CUArray handle of <paramref name="array"/> to the (CUarray, CUstream) overload.
        /// </summary>
        /// <param name="array">Source array</param>
        /// <param name="stream">Stream passed on to the copy call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray sourceArray = array.CUArray;
            this.AsyncCopyFromArray2D(sourceArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy host to device using cuMemcpyHtoDAsync_v2.
        /// Transfers SizeInBytes bytes (pitch * height) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream passed to cuMemcpyHtoDAsync_v2</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoDAsync_v2 did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy device to host using cuMemcpyDtoHAsync_v2.
        /// Transfers SizeInBytes bytes (pitch * height) as one linear block into this buffer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream passed to cuMemcpyDtoHAsync_v2</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoHAsync_v2 did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy host to device using cuMemcpyHtoDAsync_v2.
        /// Transfers SizeInBytes bytes (pitch * height) as one linear block to the variable's device pointer.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream passed to cuMemcpyHtoDAsync_v2</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoDAsync_v2 did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<char1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(target, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy device to host using cuMemcpyDtoHAsync_v2.
        /// Transfers SizeInBytes bytes (pitch * height) as one linear block from the variable's device pointer.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream passed to cuMemcpyDtoHAsync_v2</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoHAsync_v2 did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<char1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, source, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous copy host to pitched device memory using cuMemcpy2DAsync_v2.
        /// Copies Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination in bytes</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: this pinned host buffer.
            copyParams.srcMemoryType = CUMemoryType.Host;
            copyParams.srcHost = _intPtr;
            copyParams.srcPitch = _pitchInBytes;

            // Destination: pitched device memory.
            copyParams.dstMemoryType = CUMemoryType.Device;
            copyParams.dstDevice = devicePtr;
            copyParams.dstPitch = pitchDevice;

            // Extent of the copy.
            copyParams.WidthInBytes = _width * _typeSize;
            copyParams.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy host to pitched device memory.
        /// Forwards the device pointer and pitch of <paramref name="deviceVar"/> to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream passed on to the copy call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<char1> deviceVar, CUstream stream)
        {
            CUdeviceptr targetPtr = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            this.AsyncCopyToDevice(targetPtr, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronous copy from pitched device memory to host using cuMemcpy2DAsync_v2.
        /// Copies Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source in bytes</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: pitched device memory.
            copyParams.srcMemoryType = CUMemoryType.Device;
            copyParams.srcDevice = devicePtr;
            copyParams.srcPitch = pitchDevice;

            // Destination: this pinned host buffer.
            copyParams.dstMemoryType = CUMemoryType.Host;
            copyParams.dstHost = _intPtr;
            copyParams.dstPitch = _pitchInBytes;

            // Extent of the copy.
            copyParams.WidthInBytes = _width * _typeSize;
            copyParams.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from pitched device memory to host.
        /// Forwards the device pointer and pitch of <paramref name="deviceVar"/> to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream passed on to the copy call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<char1> deviceVar, CUstream stream)
        {
            CUdeviceptr sourcePtr = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            this.AsyncCopyFromDevice(sourcePtr, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space,
        /// obtained from cuMemHostGetDevicePointer_v2 (flags argument 0).
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPtr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPtr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res == CUResult.Success)
            {
                return mappedPtr;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer,
        /// as reported by cuMemHostGetFlags.
        /// </summary>
        /// <returns>Flags reported by the driver for this allocation</returns>
        /// <exception cref="ObjectDisposedException">The object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemHostGetFlags did not return CUResult.Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log the API that was actually called; the label used to read "cuMemHostGetDevicePointer",
            // which made debug traces point at the wrong driver function.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumerator over all elements of this buffer; refuses to hand one out once disposed.
        IEnumerator<char1> IEnumerable<char1>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_char1(this);
        }

        // Non-generic enumerator; backed by the same enumerator type as the generic version.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_char1(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_char1.<para/>
    /// Visits the elements row by row: x advances up to Width - 1, then wraps to 0 while y advances by one.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_char1 : IEnumerator<char1>
    {
        private CudaPageLockedHostMemory2D_char1 _memory = null;
        // Current position in elements; x starts at -1 so that the first MoveNext lands on (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_char1(CudaPageLockedHostMemory2D_char1 memory)
        {
            _memory = memory;
        }

        // Nothing to release: the enumerator only holds a reference to the buffer.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position, read through the buffer's indexer.
        /// The position is not validated here.
        /// </summary>
        public char1 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position, boxed as object.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the new position lies inside the buffer, false if there are no more elements</returns>
        public bool MoveNext()
        {
            // A buffer with no columns or no rows has no elements. Without this guard a Width of 0
            // triggered the row wrap below on every call, so MoveNext returned true once per
            // remaining row even though there was nothing valid to read.
            if ((long)_memory.Width <= 0 || (long)_memory.Height <= 0)
                return false;

            _currentX += 1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: continue at the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: char2
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_char2: IDisposable, IEnumerable<char2>
	{
		// Address filled in by cuMemHostAlloc in the constructor; passed to cuMemFreeHost in Dispose(true).
		IntPtr _intPtr;
		// Same address as _intPtr, typed as char2*.
		char2* _ptr;
        // Allocation size in bytes: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Row width in elements.
        SizeT _width = 0;
        // Distance between the starts of two rows in bytes; the constructor rejects values below _typeSize * width.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(char2)), set in the constructor.
        SizeT _typeSize = 0;
        // Result code of the most recent driver API call made through this object.
        CUResult res;
        // Set to true once Dispose(true) has called cuMemFreeHost.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char2 and allocates pitchInBytes * height bytes on host using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(char2)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return CUResult.Success</exception>
        public CudaPageLockedHostMemory2D_char2(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            this._intPtr = IntPtr.Zero;
            this._width = width;
            this._pitchInBytes = pitchInBytes;
            this._height = height;
            this._typeSize = (SizeT)Marshal.SizeOf(typeof(char2));
            this._sizeInBytes = this._pitchInBytes * this._height;

            // A row must be able to hold all of its elements.
            if (this._typeSize * width > this._pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(char2)", "pitchInBytes");
            }

            this.res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref this._intPtr, this._sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", this.res));
            this._ptr = (char2*)this._intPtr;

            if (this.res != CUResult.Success)
            {
                throw new CudaException(this.res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char2 and allocates the memory on host.
        /// Delegates to the four-argument constructor with allocation flags set to 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_char2(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char2 and allocates the memory on host with allocation flags set to 0.<para/>
        /// Pitch is set to width * Marshal.SizeOf(typeof(char2)).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_char2(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(char2)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char2 and allocates the host memory with cuMemHostAlloc
        /// using the given allocation flags.<para/>
        /// Pitch is assumed to be width * sizeof(char2).
        /// </summary>
        /// <param name="width">Width in elements</param>
        /// <param name="height">Height in elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_char2(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, (width * (SizeT)Marshal.SizeOf(typeof(char2))), height, allocFlags)
        {
            // All work is done by the four-argument constructor.
        }

        /// <summary>
        /// Finalizer. Forwards to <c>Dispose(false)</c>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_char2()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the resources of this instance and removes it from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);

            // The finalizer has nothing left to do once Dispose(true) has run.
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Implementation behind <see cref="Dispose()"/> and the finalizer.
        /// </summary>
        /// <param name="fDisposing">
        /// true when called from <see cref="Dispose()"/>: the host memory is released with cuMemFreeHost.<para/>
        /// false when called from the finalizer: nothing is freed, only a debug warning is written.
        /// </param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Already released: neither branch below applies.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Address of the pinned host memory as returned by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Distance between the starts of two consecutive rows, in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one char2 element in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element directly in the pinned host memory.<para/>
        /// No bounds check and no disposed check is performed.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element in row <paramref name="y"/>, column <paramref name="x"/></returns>
        public char2 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are 'pitch' bytes apart, so the row start is computed on a byte pointer.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((char2*)rowStart)[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((char2*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Copies the content of this host buffer into a 2D CUDA array with a synchronous cuMemcpy2D_v2 call.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: this host buffer with its own pitch. Destination: the array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies the content of this host buffer into a 2D CUDA array (synchronous).
        /// </summary>
        /// <param name="array">Destination array wrapper; its CUArray handle is used</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            // Unwrap the handle and let the handle-based overload do the copy.
            var arrayHandle = array.CUArray;
            SynchronCopyToArray2D(arrayHandle);
        }

        /// <summary>
        /// Copies the content of a 2D CUDA array into this host buffer with a synchronous cuMemcpy2D_v2 call.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: the array. Destination: this host buffer with its own pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies the content of a 2D CUDA array into this host buffer (synchronous).
        /// </summary>
        /// <param name="array">Source array wrapper; its CUArray handle is used</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            // Unwrap the handle and let the handle-based overload do the copy.
            var arrayHandle = array.CUArray;
            SynchronCopyFromArray2D(arrayHandle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Copies the whole host buffer (SizeInBytes bytes) to device memory with cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies the whole host buffer (SizeInBytes bytes) to a device variable with cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<char2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies SizeInBytes bytes from device memory into this host buffer with cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies SizeInBytes bytes from a device variable into this host buffer with cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<char2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Copies this host buffer row by row into pitched device memory with a synchronous cuMemcpy2D_v2 call.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source and destination may use different pitches; only Width * TypeSize bytes per row are copied.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this host buffer into a pitched device variable (synchronous).
        /// </summary>
        /// <param name="deviceVar">Destination; its DevicePointer and Pitch are used</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<char2> deviceVar)
        {
            // Hand pointer and pitch to the raw overload, which does the actual copy.
            var targetPointer = deviceVar.DevicePointer;
            var targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(targetPointer, targetPitch);
        }

        /// <summary>
        /// Copies pitched device memory row by row into this host buffer with a synchronous cuMemcpy2D_v2 call.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source and destination may use different pitches; only Width * TypeSize bytes per row are copied.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a pitched device variable into this host buffer (synchronous).
        /// </summary>
        /// <param name="deviceVar">Source; its DevicePointer and Pitch are used</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<char2> deviceVar)
        {
            // Hand pointer and pitch to the raw overload, which does the actual copy.
            var sourcePointer = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(sourcePointer, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Copies the content of this host buffer into a 2D CUDA array by issuing cuMemcpy2DAsync_v2 on the given stream.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: this host buffer with its own pitch. Destination: the array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies the content of this host buffer into a 2D CUDA array on the given stream.
        /// </summary>
        /// <param name="array">Destination array wrapper; its CUArray handle is used</param>
        /// <param name="stream">Stream passed to the driver call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            // Unwrap the handle and let the handle-based overload do the copy.
            var arrayHandle = array.CUArray;
            AsyncCopyToArray2D(arrayHandle, stream);
        }

        /// <summary>
        /// Copies the content of a 2D CUDA array into this host buffer by issuing cuMemcpy2DAsync_v2 on the given stream.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: the array. Destination: this host buffer with its own pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies the content of a 2D CUDA array into this host buffer on the given stream.
        /// </summary>
        /// <param name="array">Source array wrapper; its CUArray handle is used</param>
        /// <param name="stream">Stream passed to the driver call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            // Unwrap the handle and let the handle-based overload do the copy.
            var arrayHandle = array.CUArray;
            AsyncCopyFromArray2D(arrayHandle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Copies the whole host buffer (SizeInBytes bytes) to device memory by issuing cuMemcpyHtoDAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies SizeInBytes bytes from device memory into this host buffer by issuing cuMemcpyDtoHAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies the whole host buffer (SizeInBytes bytes) to a device variable by issuing cuMemcpyHtoDAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<char2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies SizeInBytes bytes from a device variable into this host buffer by issuing cuMemcpyDtoHAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<char2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Copies this host buffer row by row into pitched device memory by issuing cuMemcpy2DAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed as dstPitch</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source and destination may use different pitches; only Width * TypeSize bytes per row are copied.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this host buffer into a pitched device variable on the given stream.
        /// </summary>
        /// <param name="deviceVar">Destination; its DevicePointer and Pitch are used</param>
        /// <param name="stream">Stream passed to the driver call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<char2> deviceVar, CUstream stream)
        {
            // Hand pointer and pitch to the raw overload, which does the actual copy.
            var targetPointer = deviceVar.DevicePointer;
            var targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(targetPointer, targetPitch, stream);
        }

        /// <summary>
        /// Copies pitched device memory row by row into this host buffer by issuing cuMemcpy2DAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed as srcPitch</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source and destination may use different pitches; only Width * TypeSize bytes per row are copied.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a pitched device variable into this host buffer on the given stream.
        /// </summary>
        /// <param name="deviceVar">Source; its DevicePointer and Pitch are used</param>
        /// <param name="stream">Stream passed to the driver call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<char2> deviceVar, CUstream stream)
        {
            // Hand pointer and pitch to the raw overload, which does the actual copy.
            var sourcePointer = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(sourcePointer, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer obtained from cuMemHostGetDevicePointer_v2</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns>Flags reported by cuMemHostGetFlags for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log under the name of the function that was actually called (the label used to say cuMemHostGetDevicePointer).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        IEnumerator<char2> IEnumerable<char2>.GetEnumerator()
        {
            // Refuse to hand out an enumerator over memory that has already been freed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_char2(this);
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            // Refuse to hand out an enumerator over memory that has already been freed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_char2(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_char2.<para/>
    /// Visits the elements row by row: x runs from 0 to Width - 1, then y advances to the next row.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_char2 : IEnumerator<char2>
    {
        private CudaPageLockedHostMemory2D_char2 _memory = null;
        // Starts one step before the first column so that the first MoveNext() lands on (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over the given memory. The enumerator reads elements through the
        /// memory's indexer and does not itself check whether the memory has been disposed.
        /// </summary>
        /// <param name="memory">Memory to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_char2(CudaPageLockedHostMemory2D_char2 memory)
        {
            _memory = memory;
        }

        // The enumerator owns no resources.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position.
        /// </summary>
        public char2 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed).
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false once all rows have been passed or if the memory has no columns</returns>
        public bool MoveNext()
        {
            // A buffer without columns has no elements. Without this guard the wrap-around below
            // would reset x to 0 on every call and report Height - 1 positions that do not exist.
            if ((long)_memory.Width <= 0)
                return false;

            _currentX += 1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: continue at the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            if ((long)_currentY >= (long)_memory.Height)
                return false;
            else
                return true;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: char3
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_char3: IDisposable, IEnumerable<char3>
	{
		// Address of the host allocation; written by cuMemHostAlloc in the constructor and passed to cuMemFreeHost on dispose.
		IntPtr _intPtr;
		// Same address as _intPtr, typed as char3* for the element indexer.
		char3* _ptr;
        // Total allocation size: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Number of elements per row.
        SizeT _width = 0;
        // Distance between the starts of two consecutive rows, in bytes.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(char3)), set in the constructor.
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made by this instance.
        CUResult res;
        // Set to true once Dispose has released the host memory.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char3 and allocates pitchInBytes * height bytes of host memory with cuMemHostAlloc.
        /// </summary>
        /// <param name="width">Width in elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">Height in elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(char3)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_char3(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            // Record the requested geometry. The allocation spans 'height' rows of 'pitchInBytes' bytes each.
            _typeSize = (SizeT)Marshal.SizeOf(typeof(char3));
            _height = height;
            _pitchInBytes = pitchInBytes;
            _width = width;
            _sizeInBytes = _pitchInBytes * _height;
            _intPtr = IntPtr.Zero;

            // One row must be able to hold 'width' elements.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(char3)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));

            // The typed pointer is taken before the result check, exactly as the untyped one was written by the driver call.
            _ptr = (char3*)_intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char3 with an explicit pitch and allocates the host memory
        /// with cuMemHostAlloc, passing 0 as allocation flags.
        /// </summary>
        /// <param name="width">Width in elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">Height in elements</param>
        public CudaPageLockedHostMemory2D_char3(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
            // All work is done by the four-argument constructor.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char3 and allocates the host memory with cuMemHostAlloc,
        /// passing 0 as allocation flags.<para/>
        /// Pitch is assumed to be width * sizeof(char3).
        /// </summary>
        /// <param name="width">Width in elements</param>
        /// <param name="height">Height in elements</param>
        public CudaPageLockedHostMemory2D_char3(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(char3)), height, (CUMemHostAllocFlags)0)
        {
            // All work is done by the four-argument constructor.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char3 and allocates the host memory with cuMemHostAlloc
        /// using the given allocation flags.<para/>
        /// Pitch is assumed to be width * sizeof(char3).
        /// </summary>
        /// <param name="width">Width in elements</param>
        /// <param name="height">Height in elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_char3(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, (width * (SizeT)Marshal.SizeOf(typeof(char3))), height, allocFlags)
        {
            // All work is done by the four-argument constructor.
        }

        /// <summary>
        /// Finalizer. Forwards to <c>Dispose(false)</c>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_char3()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the resources of this instance and removes it from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);

            // The finalizer has nothing left to do once Dispose(true) has run.
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Implementation behind <see cref="Dispose()"/> and the finalizer.
        /// </summary>
        /// <param name="fDisposing">
        /// true when called from <see cref="Dispose()"/>: the host memory is released with cuMemFreeHost.<para/>
        /// false when called from the finalizer: nothing is freed, only a debug warning is written.
        /// </param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Already released: neither branch below applies.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Address of the pinned host memory as returned by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Distance between the starts of two consecutive rows, in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one char3 element in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element directly in the pinned host memory.<para/>
        /// No bounds check and no disposed check is performed.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element in row <paramref name="y"/>, column <paramref name="x"/></returns>
        public char3 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are 'pitch' bytes apart, so the row start is computed on a byte pointer.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((char3*)rowStart)[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((char3*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Copies the content of this host buffer into a 2D CUDA array with a synchronous cuMemcpy2D_v2 call.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: this host buffer with its own pitch. Destination: the array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies the content of this host buffer into a 2D CUDA array (synchronous).
        /// </summary>
        /// <param name="array">Destination array wrapper; its CUArray handle is used</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            // Unwrap the handle and let the handle-based overload do the copy.
            var arrayHandle = array.CUArray;
            SynchronCopyToArray2D(arrayHandle);
        }

        /// <summary>
        /// Copies the content of a 2D CUDA array into this host buffer with a synchronous cuMemcpy2D_v2 call.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: the array. Destination: this host buffer with its own pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer.
        /// Forwards <c>array.CUArray</c> to the overload taking a raw array handle.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray arrayHandle = array.CUArray;
            SynchronCopyFromArray2D(arrayHandle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy of SizeInBytes bytes from this host buffer to device memory (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of SizeInBytes bytes from this host buffer to a device variable (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is the copy target</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<char3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of SizeInBytes bytes from device memory into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of SizeInBytes bytes from a device variable into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is the copy source</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<char3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy from this host buffer to pitched device memory (cuMemcpy2D_v2).<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory, passed on as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: this host buffer.
            copyParams.srcMemoryType = CUMemoryType.Host;
            copyParams.srcHost = _intPtr;
            copyParams.srcPitch = _pitchInBytes;

            // Destination: pitched device memory.
            copyParams.dstMemoryType = CUMemoryType.Device;
            copyParams.dstDevice = devicePtr;
            copyParams.dstPitch = pitchDevice;

            // Extent of the copied region.
            copyParams.Height = _height;
            copyParams.WidthInBytes = _width * _typeSize;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this host buffer to a pitched device variable.
        /// Forwards the variable's DevicePointer and Pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<char3> deviceVar)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this host buffer (cuMemcpy2D_v2).<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory, passed on as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: pitched device memory.
            copyParams.srcMemoryType = CUMemoryType.Device;
            copyParams.srcDevice = devicePtr;
            copyParams.srcPitch = pitchDevice;

            // Destination: this host buffer.
            copyParams.dstMemoryType = CUMemoryType.Host;
            copyParams.dstHost = _intPtr;
            copyParams.dstPitch = _pitchInBytes;

            // Extent of the copied region.
            copyParams.Height = _height;
            copyParams.WidthInBytes = _width * _typeSize;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a pitched device variable into this host buffer.
        /// Forwards the variable's DevicePointer and Pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<char3> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy of the whole host buffer into a 2D CUDA array (cuMemcpy2DAsync_v2).<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows are read using the host pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: this host buffer.
            copyParams.srcMemoryType = CUMemoryType.Host;
            copyParams.srcHost = _intPtr;
            copyParams.srcPitch = _pitchInBytes;

            // Destination: the CUDA array.
            copyParams.dstMemoryType = CUMemoryType.Array;
            copyParams.dstArray = deviceArray;

            // Extent of the copied region.
            copyParams.Height = _height;
            copyParams.WidthInBytes = _width * _typeSize;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of the whole host buffer into a 2D CUDA array.
        /// Forwards <c>array.CUArray</c> and the stream to the overload taking a raw array handle.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray arrayHandle = array.CUArray;
            AsyncCopyToArray2D(arrayHandle, stream);
        }

        /// <summary>
        /// Asynchronous copy of a 2D CUDA array into this host buffer (cuMemcpy2DAsync_v2).<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows are written using the host pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: the CUDA array.
            copyParams.srcMemoryType = CUMemoryType.Array;
            copyParams.srcArray = deviceArray;

            // Destination: this host buffer.
            copyParams.dstMemoryType = CUMemoryType.Host;
            copyParams.dstHost = _intPtr;
            copyParams.dstPitch = _pitchInBytes;

            // Extent of the copied region.
            copyParams.Height = _height;
            copyParams.WidthInBytes = _width * _typeSize;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of a 2D CUDA array into this host buffer.
        /// Forwards <c>array.CUArray</c> and the stream to the overload taking a raw array handle.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray arrayHandle = array.CUArray;
            AsyncCopyFromArray2D(arrayHandle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from this host buffer to device memory (cuMemcpyHtoDAsync_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, this._intPtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from device memory into this host buffer (cuMemcpyDtoHAsync_v2).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, devicePtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from this host buffer to a device variable (cuMemcpyHtoDAsync_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is the copy target</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<char3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, this._intPtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from a device variable into this host buffer (cuMemcpyDtoHAsync_v2).
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is the copy source</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<char3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, devicePtr.DevicePointer, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy from this host buffer to pitched device memory (cuMemcpy2DAsync_v2).<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory, passed on as dstPitch</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: this host buffer.
            copyParams.srcMemoryType = CUMemoryType.Host;
            copyParams.srcHost = _intPtr;
            copyParams.srcPitch = _pitchInBytes;

            // Destination: pitched device memory.
            copyParams.dstMemoryType = CUMemoryType.Device;
            copyParams.dstDevice = devicePtr;
            copyParams.dstPitch = pitchDevice;

            // Extent of the copied region.
            copyParams.Height = _height;
            copyParams.WidthInBytes = _width * _typeSize;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from this host buffer to a pitched device variable.
        /// Forwards the variable's DevicePointer and Pitch plus the stream to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<char3> deviceVar, CUstream stream)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(target, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this host buffer (cuMemcpy2DAsync_v2).<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory, passed on as srcPitch</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: pitched device memory.
            copyParams.srcMemoryType = CUMemoryType.Device;
            copyParams.srcDevice = devicePtr;
            copyParams.srcPitch = pitchDevice;

            // Destination: this host buffer.
            copyParams.dstMemoryType = CUMemoryType.Host;
            copyParams.dstHost = _intPtr;
            copyParams.dstPitch = _pitchInBytes;

            // Extent of the copied region.
            copyParams.Height = _height;
            copyParams.WidthInBytes = _width * _typeSize;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable into this host buffer.
        /// Forwards the variable's DevicePointer and Pitch plus the stream to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<char3> deviceVar, CUstream stream)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer that corresponds to this pinned host buffer (cuMemHostGetDevicePointer_v2, flags = 0).<para/>
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer reported by the driver</returns>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>Allocation flags reported by the driver for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // The trace must name the API that was actually called; it previously reported
            // "cuMemHostGetDevicePointer", which pointed debugging at the wrong driver function.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new <see cref="CudaPageLockedHostMemory2DEnumerator_char3"/> bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        IEnumerator<char3> IEnumerable<char3>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_char3(this);
        }

        /// <summary>
        /// Creates an untyped enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new <see cref="CudaPageLockedHostMemory2DEnumerator_char3"/> bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_char3(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_char3.<para/>
    /// Elements are visited row by row: x runs over a complete row before y advances.<para/>
    /// <see cref="Current"/> is validated against the buffer dimensions, because the underlying
    /// indexer is passed the position unchanged and would otherwise be asked for an element
    /// outside of the buffer before the first or after the last element.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_char3 : IEnumerator<char3>
    {
        private CudaPageLockedHostMemory2D_char3 _memory = null;
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;
        // True once MoveNext has been called since construction or the last Reset.
        private bool _started = false;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_char3(CudaPageLockedHostMemory2D_char3 memory)
        {
            _memory = memory;
        }

        // The enumerator owns no resources; the buffer stays owned by its creator.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
            _started = false;
        }

        /// <summary>
        /// Checks that the current position addresses an element inside the buffer.
        /// </summary>
        private bool IsPositionValid()
        {
            if (!_started)
                return false;

            return (long)_currentX < (long)_memory.Width
                && (long)_currentY < (long)_memory.Height;
        }

        /// <summary>
        /// Returns the element at the current position or throws if there is none.
        /// </summary>
        private char3 GetCurrentElement()
        {
            if (!IsPositionValid())
                throw new InvalidOperationException("The enumerator is positioned before the first element or after the last element.");

            return _memory[_currentX, _currentY];
        }

        /// <summary>
        /// Element at the current position.
        /// </summary>
        /// <exception cref="InvalidOperationException">The enumerator is positioned before the first or after the last element</exception>
        public char3 Current
        {
            get { return GetCurrentElement(); }
        }

        /// <summary>
        /// Element at the current position (boxed).
        /// </summary>
        /// <exception cref="InvalidOperationException">The enumerator is positioned before the first or after the last element</exception>
        object IEnumerator.Current
        {
            get { return GetCurrentElement(); }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now addresses an element; false if the end was passed or the buffer is empty</returns>
        public bool MoveNext()
        {
            _started = true;

            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // Nothing to enumerate (zero-sized buffer) or already past the end: stay there
            // instead of advancing further, so repeated calls keep returning false.
            if (width <= 0 || (long)_currentY >= height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                // End of row reached: wrap to the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: char4
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_char4: IDisposable, IEnumerable<char4>
	{
		IntPtr _intPtr;
		char4* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char4 and allocates pitchInBytes * height bytes of host memory using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(char4)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_char4(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();

            // Geometry of the buffer.
            _typeSize = (SizeT)Marshal.SizeOf(typeof(char4));
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _sizeInBytes = pitchInBytes * height;

            // One row of elements has to fit into one pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(char4)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (char4*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char4 and allocates the memory on host. Uses cuMemHostAlloc with flags = 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_char4(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char4 and allocates the memory on host. Uses cuMemHostAlloc with flags = 0.<para/>
        /// Pitch is assumed to be width * sizeof(char4).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_char4(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(char4)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_char4 and allocates the memory on host. Uses cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(char4).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_char4(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(char4)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>, which only
        /// writes a debug warning if the instance was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_char4()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the host memory through <see cref="Dispose(bool)"/> and removes this instance
        /// from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable. Does nothing if the instance is already disposed.
        /// </summary>
        /// <param name="fDisposing">
        /// true: free the host memory with cuMemFreeHost and mark the instance as disposed;
        /// false (finalizer path): only write a not-disposed warning to the debug output.
        /// </param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // The result is only traced, never thrown.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Host pointer to the pinned memory as filled in by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row, as passed to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows, as passed to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Distance between the starts of two consecutive rows in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total allocation size in bytes (pitch in bytes * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one char4 element in bytes, as reported by Marshal.SizeOf.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element. The row start is located with the pitch in bytes,
        /// the element inside the row is selected by its element index.<para/>
        /// Neither the indices nor the disposed state are checked.
        /// </summary>
        /// <param name="x">Column of the element (in elements)</param>
        /// <param name="y">Row of the element (in elements)</param>
        /// <returns>The element stored at column x of row y</returns>
        public char4 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are _pitchInBytes apart, so step to the row in byte units first.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((char4*)rowStart)[x];
            }
            set
            {
                // Rows are _pitchInBytes apart, so step to the row in byte units first.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((char4*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy of the whole host buffer into a 2D CUDA array (cuMemcpy2D_v2).<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows are read using the host pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: this host buffer.
            copyParams.srcMemoryType = CUMemoryType.Host;
            copyParams.srcHost = _intPtr;
            copyParams.srcPitch = _pitchInBytes;

            // Destination: the CUDA array.
            copyParams.dstMemoryType = CUMemoryType.Array;
            copyParams.dstArray = deviceArray;

            // Extent of the copied region.
            copyParams.Height = _height;
            copyParams.WidthInBytes = _width * _typeSize;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of the whole host buffer into a 2D CUDA array.
        /// Forwards <c>array.CUArray</c> to the overload taking a raw array handle.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray arrayHandle = array.CUArray;
            SynchronCopyToArray2D(arrayHandle);
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer (cuMemcpy2D_v2).<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows are written using the host pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: the CUDA array.
            copyParams.srcMemoryType = CUMemoryType.Array;
            copyParams.srcArray = deviceArray;

            // Destination: this host buffer.
            copyParams.dstMemoryType = CUMemoryType.Host;
            copyParams.dstHost = _intPtr;
            copyParams.dstPitch = _pitchInBytes;

            // Extent of the copied region.
            copyParams.Height = _height;
            copyParams.WidthInBytes = _width * _typeSize;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer.
        /// Forwards <c>array.CUArray</c> to the overload taking a raw array handle.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray arrayHandle = array.CUArray;
            SynchronCopyFromArray2D(arrayHandle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy of SizeInBytes (pitch * height) bytes from this host buffer to device memory (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of SizeInBytes (pitch * height) bytes from this host buffer to a device variable (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is the copy target</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<char4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of SizeInBytes (pitch * height) bytes from device memory into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of SizeInBytes (pitch * height) bytes from a device variable into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is the copy source</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<char4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy from this host buffer to pitched device memory (cuMemcpy2D_v2).<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory, passed on as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This instance is already disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: this host buffer.
            copyParams.srcMemoryType = CUMemoryType.Host;
            copyParams.srcHost = _intPtr;
            copyParams.srcPitch = _pitchInBytes;

            // Destination: pitched device memory.
            copyParams.dstMemoryType = CUMemoryType.Device;
            copyParams.dstDevice = devicePtr;
            copyParams.dstPitch = pitchDevice;

            // Extent of the copied region.
            copyParams.Height = _height;
            copyParams.WidthInBytes = _width * _typeSize;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this host buffer to a pitched device variable.
        /// Forwards the variable's DevicePointer and Pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<char4> deviceVar)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // 2D copy descriptor: pitched device rows are the source, pinned host rows the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size (not the pitch).
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<char4> deviceVar)
        {
            // Convenience overload: forward the variable's device pointer and pitch
            // to the (CUdeviceptr, SizeT) overload, which does the actual copy.
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // 2D copy descriptor: pinned host rows are the source, the CUDA array the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                // Bytes copied per row: element count times element size (not the pitch).
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            // Issue the copy through the async driver entry point on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            // Convenience overload: unwrap the CUarray handle and delegate to the CUarray overload.
            this.AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // 2D copy descriptor: the CUDA array is the source, pinned host rows the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size (not the pitch).
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            // Issue the copy through the async driver entry point on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            // Convenience overload: unwrap the CUarray handle and delegate to the CUarray overload.
            this.AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host -> device copy of SizeInBytes bytes from the pinned buffer, issued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, this._intPtr, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Device -> host copy of SizeInBytes bytes into the pinned buffer, issued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaDeviceVariable<char4> devicePtr, CUstream stream)
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host -> device copy of SizeInBytes bytes into the variable's device memory, issued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<char4> devicePtr, CUstream stream)
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Device -> host copy of SizeInBytes bytes from the variable's device memory, issued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // 2D copy descriptor: pinned host rows are the source, pitched device rows the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                // Bytes copied per row: element count times element size (not the pitch).
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            // Issue the copy through the async driver entry point on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<char4> deviceVar, CUstream stream)
        {
            // Convenience overload: forward the variable's device pointer and pitch
            // to the (CUdeviceptr, SizeT, CUstream) overload, which does the actual copy.
            this.AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // 2D copy descriptor: pitched device rows are the source, pinned host rows the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size (not the pitch).
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            // Issue the copy through the async driver entry point on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<char4> deviceVar, CUstream stream)
        {
            // Convenience overload: forward the variable's device pointer and pitch
            // to the (CUdeviceptr, SizeT, CUstream) overload, which does the actual copy.
            this.AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Ask the driver for the device-side pointer of the pinned buffer; the last argument is passed as 0.
            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns></returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Query the driver for the flags associated with the pinned host buffer.
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);

            // The trace names the driver function that was actually called
            // (it previously carried the label "cuMemHostGetDevicePointer").
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return flags;
        }
        #endregion

        #region IEnumerable
        IEnumerator<char4> IEnumerable<char4>.GetEnumerator()
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Each call hands out a fresh enumerator over this buffer.
            return new CudaPageLockedHostMemory2DEnumerator_char4(this);
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            // Reject calls made on an instance that is flagged as disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Non-generic variant: same enumerator type, returned through the IEnumerator interface.
            return new CudaPageLockedHostMemory2DEnumerator_char4(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_char4
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_char4 : IEnumerator<char4>
    {
        // Buffer being enumerated; elements are read through its [x, y] indexer.
        private CudaPageLockedHostMemory2D_char4 _memory = null;
        // Cursor position. X starts at -1 so that the first increment in MoveNext() moves it to column 0.
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over <paramref name="memory"/>, positioned before the first element.
        /// </summary>
        /// <param name="memory">Pinned 2D host memory whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_char4(CudaPageLockedHostMemory2D_char4 memory)
        {
            _memory = memory;
        }

        // Nothing to release: the enumerator only holds a reference to the memory object.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the cursor back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current cursor position, read through the memory's indexer.
        /// </summary>
        public char4 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current cursor position, boxed as <see cref="object"/>.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances the cursor by one element, walking each row from left to right before moving to the next row.
        /// </summary>
        /// <returns>true if the cursor is on an element; false if there are no (more) elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // Rows without elements (or no rows at all) contain nothing to enumerate.
            // Without this guard a zero-width buffer with more than one row reported
            // elements and Current read outside of the row.
            if (width <= 0 || height <= 0)
                return false;

            // Already past the last element: stay there instead of advancing any further.
            if ((long)_currentY >= height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                // End of the row reached: wrap around to the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: short
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_short: IDisposable, IEnumerable<short>
	{
		// Address of the host allocation filled in by cuMemHostAlloc in the constructor; passed to cuMemFreeHost in Dispose.
		IntPtr _intPtr;
		// The same address typed as short*, used by the indexer for element access.
		short* _ptr;
        // Size of the allocation in bytes (pitch in bytes * height).
        SizeT _sizeInBytes = 0;
        // Number of elements per row.
        SizeT _width = 0;
        // Distance between the starts of two consecutive rows, in bytes.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Size of one element in bytes, taken from Marshal.SizeOf(typeof(short)).
        SizeT _typeSize = 0;
        // Status of the most recent driver API call made by this instance.
        CUResult res;
        // Set to true once Dispose has passed the buffer to cuMemFreeHost.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_short(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            // Record the requested geometry; the allocation spans pitch * height bytes.
            _intPtr = IntPtr.Zero;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(short));
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _sizeInBytes = _pitchInBytes * _height;

            // A row of 'width' elements has to fit into one pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(short)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));

            // The typed pointer is assigned before the status check, so it always mirrors _intPtr.
            _ptr = (short*)_intPtr;

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_short(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
            // All work happens in the four-argument constructor; 0 is passed as the allocation flags.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(short).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_short(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(short)), height, (CUMemHostAllocFlags)0)
        {
            // Rows are packed without padding: the pitch passed on is exactly width * element size,
            // and 0 is passed as the allocation flags.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short and allocates the memory on host using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(short).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_short(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(short)), height, allocFlags)
        {
            // Rows are packed without padding: the pitch passed on is exactly width * element size.
        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_short()
        {
            // Finalizer path: Dispose(false) frees nothing, it only writes the
            // not-disposed warning to the debug output if Dispose() was never called.
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            // Explicit disposal: run the releasing branch of Dispose(bool).
            this.Dispose(true);

            // Take this instance off the finalization queue.
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing"></param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Nothing left to do (and nothing to warn about) once the buffer has been released.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // Explicit Dispose(): hand the pinned buffer back to the driver. The status is
                // only traced, never thrown.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                // Finalizer path: the buffer is not freed here, only a warning is written.
                Debug.WriteLine(string.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                // Raw address of the pinned allocation.
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get
            {
                // Elements per row, as passed to the constructor.
                return this._width;
            }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get
            {
                // Number of rows, as passed to the constructor.
                return this._height;
            }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                // Row stride in bytes, as passed to (or derived by) the constructor.
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                // Allocation size: pitch in bytes * height, computed in the constructor.
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                // Size of one short element in bytes, as determined in the constructor.
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns></returns>
        public short this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are _pitchInBytes apart, so the row start is located in bytes
                // and then reinterpreted as an array of short. No bounds check is made.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((short*)rowStart)[x];
            }
            set
            {
                // Same addressing as the getter: byte offset to the row, element index within it.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((short*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            // Reject calls made after the buffer has been disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // 2D copy descriptor: pinned host rows are the source, the CUDA array the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                // Bytes copied per row: element count times element size (not the pitch).
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            // Convenience overload: unwrap the CUarray handle and delegate to the CUarray overload.
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            // Reject calls made after the buffer has been disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // 2D copy descriptor: the CUDA array is the source, pinned host rows the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size (not the pitch).
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            // Convenience overload: unwrap the CUarray handle and delegate to the CUarray overload.
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            // Reject calls made after the buffer has been disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host -> device copy of the whole allocation (SizeInBytes bytes, i.e. including any row padding).
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CudaDeviceVariable<short> devicePtr)
        {
            // Reject calls made after the buffer has been disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host -> device copy of the whole allocation (SizeInBytes bytes) into the variable's device memory.
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            // Reject calls made after the buffer has been disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Device -> host copy filling the whole allocation (SizeInBytes bytes).
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CudaDeviceVariable<short> devicePtr)
        {
            // Reject calls made after the buffer has been disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Device -> host copy from the variable's device memory, filling the whole allocation (SizeInBytes bytes).
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            // Reject calls made after the buffer has been disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // 2D copy descriptor: pinned host rows are the source, pitched device rows the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                // Bytes copied per row: element count times element size (not the pitch).
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<short> deviceVar)
        {
            // Convenience overload: forward the variable's device pointer and pitch
            // to the (CUdeviceptr, SizeT) overload, which does the actual copy.
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            // Reject calls made after the buffer has been disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // 2D copy descriptor: pitched device rows are the source, pinned host rows the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size (not the pitch).
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<short> deviceVar)
        {
            // Convenience overload: forward the variable's device pointer and pitch
            // to the (CUdeviceptr, SizeT) overload, which does the actual copy.
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            // Reject calls made after the buffer has been disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // 2D copy descriptor: pinned host rows are the source, the CUDA array the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                // Bytes copied per row: element count times element size (not the pitch).
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            // Issue the copy through the async driver entry point on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            // Convenience overload: unwrap the CUarray handle and delegate to the CUarray overload.
            this.AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            // Reject calls made after the buffer has been disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // 2D copy descriptor: the CUDA array is the source, pinned host rows the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size (not the pitch).
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            // Issue the copy through the async driver entry point on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            // Convenience overload: unwrap the CUarray handle and delegate to the CUarray overload.
            this.AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            // Reject calls made after the buffer has been disposed.
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host -> device copy of the whole allocation (SizeInBytes bytes), issued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, this._intPtr, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            // Any status other than Success is reported as a CudaException.
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from linear device memory into this host buffer (cuMemcpyDtoHAsync).<para/>
        /// <c>SizeInBytes</c> bytes are transferred.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of this host buffer to a linear device variable (cuMemcpyHtoDAsync).<para/>
        /// <c>SizeInBytes</c> bytes are transferred to the variable's <c>DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<short> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a linear device variable into this host buffer (cuMemcpyDtoHAsync).<para/>
        /// <c>SizeInBytes</c> bytes are transferred from the variable's <c>DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<short> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy of this host buffer to pitched device memory (cuMemcpy2DAsync).<para/>
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes; the host side is read with this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed as <c>dstPitch</c></param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer.
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                // Destination: caller supplied pitched device memory.
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy of this host buffer to a pitched device variable.<para/>
        /// Forwards the variable's <c>DevicePointer</c> and <c>Pitch</c> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<short> deviceVar, CUstream stream)
        {
            var targetPointer = deviceVar.DevicePointer;
            var targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(targetPointer, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this host buffer (cuMemcpy2DAsync).<para/>
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes; the host side is written with this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed as <c>srcPitch</c></param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: caller supplied pitched device memory.
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                // Destination: this pinned host buffer.
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable into this host buffer.<para/>
        /// Forwards the variable's <c>DevicePointer</c> and <c>Pitch</c> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<short> deviceVar, CUstream stream)
        {
            var sourcePointer = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(sourcePointer, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer under which this pinned host memory is mapped into device address space (cuMemHostGetDevicePointer).<para/>
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>Flags reported by the driver for this allocation</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log under the name of the call actually made; the label previously read "cuMemHostGetDevicePointer".
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumerator over all elements; refuses to enumerate a disposed buffer.
        IEnumerator<short> IEnumerable<short>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_short(this);
        }

        // Non-generic enumerator over all elements; refuses to enumerate a disposed buffer.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_short(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_short.<para/>
    /// Steps through the elements along x first, then moves on to the next y.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_short : IEnumerator<short>
    {
        private CudaPageLockedHostMemory2D_short _memory;
        // Start one step before x = 0 so that the first MoveNext() is meant to land on (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over the given memory.
        /// </summary>
        /// <param name="memory">Page locked host memory whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_short(CudaPageLockedHostMemory2D_short memory)
        {
            _memory = memory;
        }

        void IDisposable.Dispose()
        {
            // Nothing to release: only a reference to the memory object is held.
        }

        /// <summary>
        /// Puts the position back to its initial state (x = -1, y = 0).
        /// </summary>
        public void Reset()
        {
            _currentY = 0;
            _currentX = -1;
        }

        /// <summary>
        /// Element at the current (x, y) position, read through the memory's indexer.
        /// </summary>
        public short Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Boxed element at the current (x, y) position.
        /// </summary>
        object IEnumerator.Current
        {
            get
            {
                return Current;
            }
        }

        /// <summary>
        /// Advances by one element; at the end of a row x restarts at 0 and y is incremented.
        /// </summary>
        /// <returns>false once y has reached the memory's Height, true otherwise</returns>
        public bool MoveNext()
        {
            _currentX += 1;

            bool rowExhausted = (long)_currentX >= (long)_memory.Width;
            if (rowExhausted)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: short1
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_short1: IDisposable, IEnumerable<short1>
	{
		// Address of the pinned host allocation, filled in by cuMemHostAlloc in the constructor.
		IntPtr _intPtr;
		// Same address as _intPtr, typed as short1* for element access through the indexer.
		short1* _ptr;
        // Total allocation size in bytes: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Row width in elements.
        SizeT _width = 0;
        // Distance between the starts of two consecutive rows, in bytes.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(short1)), set in the constructor.
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made by this object.
        CUResult res;
        // Set to true once cuMemFreeHost has been called in Dispose.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short1 and allocates <c>pitchInBytes * height</c> bytes of host memory using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException"><paramref name="pitchInBytes"/> is smaller than width * sizeof(short1)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return Success</exception>
        public CudaPageLockedHostMemory2D_short1(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = IntPtr.Zero;
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(short1));
            _sizeInBytes = _pitchInBytes * _height;

            // A row of 'width' elements has to fit into one pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(short1)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (short1*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short1 with an explicit pitch and allocates the memory on host using cuMemHostAlloc with flags set to 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_short1(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short1 and allocates the memory on host using cuMemHostAlloc with flags set to 0.<para/>
        /// The pitch is taken as width * Marshal.SizeOf(typeof(short1)), i.e. rows carry no padding.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_short1(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(short1)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short1 and allocates the memory on host using cuMemHostAlloc with the given flags.<para/>
        /// The pitch is taken as width * Marshal.SizeOf(typeof(short1)), i.e. rows carry no padding.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_short1(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(short1)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <c>Dispose(bool)</c>, which only writes a debug warning if the object was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_short1()
        {
            const bool calledFromDispose = false;
            Dispose(calledFromDispose);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory and removes this object from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            const bool calledFromDispose = true;
            Dispose(calledFromDispose);

            // The memory has been released explicitly, so the finalizer has nothing left to report.
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable.<para/>
        /// When disposing explicitly the memory is released with cuMemFreeHost (the result is only logged, never thrown).
        /// When called from the finalizer nothing is freed; a not-disposed warning is written to the debug output instead.
        /// Does nothing if the object is already disposed.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Raw address of the pinned host allocation.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return _intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row, as passed to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return _width;
            }
        }

        /// <summary>
        /// Number of rows, as passed to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return _height;
            }
        }

        /// <summary>
        /// Row pitch in bytes (width including alignment).
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return _pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return _sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one short1 element in bytes, as reported by Marshal.SizeOf.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return _typeSize;
            }
        }

        /// <summary>
        /// Access array per element.<para/>
        /// The row is located at <c>y * Pitch</c> bytes from the start of the buffer and then indexed by <c>x</c>.
        /// Neither the indices nor the disposed state are checked here.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>Element stored at (x, y)</returns>
        public short1 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are addressed in bytes because the pitch may include padding.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((short1*)rowStart)[x];
            }
            set
            {
                // Rows are addressed in bytes because the pitch may include padding.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((short1*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous 2D copy of this host buffer into a CUDA array (cuMemcpy2D).<para/>
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes; the host side is read with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer.
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                // Destination: the CUDA array.
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of this host buffer into a 2D CUDA array.<para/>
        /// Forwards the array's <c>CUArray</c> handle to the <c>CUarray</c> overload.
        /// </summary>
        /// <param name="array">Destination 2D array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            var targetHandle = array.CUArray;
            SynchronCopyToArray2D(targetHandle);
        }

        /// <summary>
        /// Synchronous 2D copy from a CUDA array into this host buffer (cuMemcpy2D).<para/>
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes; the host side is written with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: the CUDA array.
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                // Destination: this pinned host buffer.
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from a 2D CUDA array into this host buffer.<para/>
        /// Forwards the array's <c>CUArray</c> handle to the <c>CUarray</c> overload.
        /// </summary>
        /// <param name="array">Source 2D array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            var sourceHandle = array.CUArray;
            SynchronCopyFromArray2D(sourceHandle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous copy of this host buffer to linear device memory (cuMemcpyHtoD).<para/>
        /// <see cref="SizeInBytes"/> bytes (pitch * height) are transferred.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of this host buffer to a linear device variable (cuMemcpyHtoD).<para/>
        /// <see cref="SizeInBytes"/> bytes (pitch * height) are transferred to the variable's <c>DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<short1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from linear device memory into this host buffer (cuMemcpyDtoH).<para/>
        /// <see cref="SizeInBytes"/> bytes (pitch * height) are transferred.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from a linear device variable into this host buffer (cuMemcpyDtoH).<para/>
        /// <see cref="SizeInBytes"/> bytes (pitch * height) are transferred from the variable's <c>DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<short1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy of this host buffer to pitched device memory (cuMemcpy2D).<para/>
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes; the host side is read with this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed as <c>dstPitch</c></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer.
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                // Destination: caller supplied pitched device memory.
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy of this host buffer to a pitched device variable.<para/>
        /// Forwards the variable's <c>DevicePointer</c> and <c>Pitch</c> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<short1> deviceVar)
        {
            var targetPointer = deviceVar.DevicePointer;
            var targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(targetPointer, targetPitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this host buffer (cuMemcpy2D).<para/>
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes; the host side is written with this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed as <c>srcPitch</c></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: caller supplied pitched device memory.
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                // Destination: this pinned host buffer.
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a pitched device variable into this host buffer.<para/>
        /// Forwards the variable's <c>DevicePointer</c> and <c>Pitch</c> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<short1> deviceVar)
        {
            var sourcePointer = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(sourcePointer, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous 2D copy of this host buffer into a CUDA array (cuMemcpy2DAsync).<para/>
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes; the host side is read with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer.
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                // Destination: the CUDA array.
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of this host buffer into a 2D CUDA array.<para/>
        /// Forwards the array's <c>CUArray</c> handle to the <c>CUarray</c> overload.
        /// </summary>
        /// <param name="array">Destination 2D array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            var targetHandle = array.CUArray;
            AsyncCopyToArray2D(targetHandle, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from a CUDA array into this host buffer (cuMemcpy2DAsync).<para/>
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes; the host side is written with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: the CUDA array.
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                // Destination: this pinned host buffer.
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a 2D CUDA array into this host buffer.<para/>
        /// Forwards the array's <c>CUArray</c> handle to the <c>CUarray</c> overload.
        /// </summary>
        /// <param name="array">Source 2D array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            var sourceHandle = array.CUArray;
            AsyncCopyFromArray2D(sourceHandle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy of this host buffer to linear device memory (cuMemcpyHtoDAsync).<para/>
        /// <see cref="SizeInBytes"/> bytes (pitch * height) are transferred.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from linear device memory into this host buffer (cuMemcpyDtoHAsync).<para/>
        /// <see cref="SizeInBytes"/> bytes (pitch * height) are transferred.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of this host buffer to a linear device variable (cuMemcpyHtoDAsync).<para/>
        /// <see cref="SizeInBytes"/> bytes (pitch * height) are transferred to the variable's <c>DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<short1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a linear device variable into this host buffer (cuMemcpyDtoHAsync).<para/>
        /// <see cref="SizeInBytes"/> bytes (pitch * height) are transferred from the variable's <c>DevicePointer</c>.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<short1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy of this host buffer to pitched device memory (cuMemcpy2DAsync).<para/>
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes; the host side is read with this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed as <c>dstPitch</c></param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer.
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                // Destination: caller supplied pitched device memory.
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy of this host buffer to a pitched device variable.<para/>
        /// Forwards the variable's <c>DevicePointer</c> and <c>Pitch</c> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<short1> deviceVar, CUstream stream)
        {
            var targetPointer = deviceVar.DevicePointer;
            var targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(targetPointer, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this host buffer (cuMemcpy2DAsync).<para/>
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes; the host side is written with this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed as <c>srcPitch</c></param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: caller supplied pitched device memory.
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                // Destination: this pinned host buffer.
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable into this host buffer.<para/>
        /// Forwards the variable's <c>DevicePointer</c> and <c>Pitch</c> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<short1> deviceVar, CUstream stream)
        {
            var sourcePointer = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(sourcePointer, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer under which this pinned host memory is mapped into device memory space
        /// (cuMemHostGetDevicePointer). Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPtr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPtr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res == CUResult.Success)
            {
                return mappedPtr;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>Allocation flags reported by the driver for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log under the name of the API that was actually called (was mislabelled as cuMemHostGetDevicePointer)
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this host memory.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_short1 bound to this instance</returns>
        IEnumerator<short1> IEnumerable<short1>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }
            return new CudaPageLockedHostMemory2DEnumerator_short1(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over the elements of this host memory.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_short1 bound to this instance</returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }
            return new CudaPageLockedHostMemory2DEnumerator_short1(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_short1.<para/>
    /// Elements are visited row by row: x advances through a row, then y moves on to the next row.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_short1 : IEnumerator<short1>
    {
        private CudaPageLockedHostMemory2D_short1 _source;
        private SizeT _column;
        private SizeT _row;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Host memory to iterate over</param>
        public CudaPageLockedHostMemory2DEnumerator_short1(CudaPageLockedHostMemory2D_short1 memory)
        {
            _source = memory;
            _column = -1;
            _row = 0;
        }

        // Nothing to release: the enumerator does not own the memory it iterates over.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _row = 0;
            _column = -1;
        }

        /// <summary>
        /// Element at the current position, read through the indexer of the underlying memory.
        /// </summary>
        public short1 Current
        {
            get { return _source[_column, _row]; }
        }

        /// <summary>
        /// Element at the current position, boxed as object.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _source[_column, _row]; }
        }

        /// <summary>
        /// Advances to the next element, wrapping to the start of the next row at the end of a row.
        /// </summary>
        /// <returns>true while the current row index is below Height, false otherwise</returns>
        public bool MoveNext()
        {
            _column += 1;

            // Stepped past the last column: continue at the beginning of the following row
            bool rowFinished = (long)_column >= (long)_source.Width;
            if (rowFinished)
            {
                _column = 0;
                _row += 1;
            }

            return (long)_row < (long)_source.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: short2
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_short2: IDisposable, IEnumerable<short2>
	{
		IntPtr _intPtr;
		short2* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host. Using cuMemHostAlloc<para/>
        /// The allocation size is pitchInBytes * height.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(short2)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return CUResult.Success</exception>
        public CudaPageLockedHostMemory2D_short2(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(short2));
            _sizeInBytes = _pitchInBytes * _height;

            // Reject a pitch that cannot hold one full row before anything is allocated
            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(short2)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (short2*) _intPtr;
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_short2(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(short2).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_short2(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(short2)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short2 and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(short2).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_short2(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(short2)), height, allocFlags)
        {

        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_short2()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable.<para/>
        /// When called with fDisposing == true on a not yet disposed object, the host memory is released with cuMemFreeHost;
        /// the result of that call is only logged. When called from the finalizer on a not yet disposed object,
        /// only a debug warning is written and the memory is not released here.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes (pitch * height)
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element. The indices are not range checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>Element in column x of row y</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        public short2 this[SizeT x, SizeT y]
        {
            get
            {
                // The host memory is freed on Dispose; refuse to dereference the stale pointer
                if (disposed) throw new ObjectDisposedException(this.ToString());
                // Rows start _pitchInBytes apart, so locate the row start in bytes before indexing by element
                short2* line = (short2*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                // The host memory is freed on Dispose; refuse to dereference the stale pointer
                if (disposed) throw new ObjectDisposedException(this.ToString());
                // Rows start _pitchInBytes apart, so locate the row start in bytes before indexing by element
                short2* line = (short2*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous copy host to device. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to device. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<short2> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy device to host. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy device to host. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<short2> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous copy host to pitched device
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory (passed on as dstPitch)</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<short2> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous copy pitched device to host
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory (passed on as srcPitch)</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<short2> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy host to device. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy device to host. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to device. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<short2> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy device to host. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<short2> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous copy host to pitched device
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory (passed on as dstPitch)</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<short2> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory (passed on as srcPitch)</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<short2> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>Allocation flags reported by the driver for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log under the name of the API that was actually called (was mislabelled as cuMemHostGetDevicePointer)
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        IEnumerator<short2> IEnumerable<short2>.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator<short2> enumerator = new CudaPageLockedHostMemory2DEnumerator_short2(this);
            return enumerator;
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator enumerator = new CudaPageLockedHostMemory2DEnumerator_short2(this);
            return enumerator;
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_short2.<para/>
    /// Elements are visited row by row: x advances through a row, then y moves on to the next row.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_short2 : IEnumerator<short2>
    {
        private CudaPageLockedHostMemory2D_short2 _source;
        private SizeT _column;
        private SizeT _row;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Host memory to iterate over</param>
        public CudaPageLockedHostMemory2DEnumerator_short2(CudaPageLockedHostMemory2D_short2 memory)
        {
            _source = memory;
            _column = -1;
            _row = 0;
        }

        // Nothing to release: the enumerator does not own the memory it iterates over.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _row = 0;
            _column = -1;
        }

        /// <summary>
        /// Element at the current position, read through the indexer of the underlying memory.
        /// </summary>
        public short2 Current
        {
            get { return _source[_column, _row]; }
        }

        /// <summary>
        /// Element at the current position, boxed as object.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _source[_column, _row]; }
        }

        /// <summary>
        /// Advances to the next element, wrapping to the start of the next row at the end of a row.
        /// </summary>
        /// <returns>true while the current row index is below Height, false otherwise</returns>
        public bool MoveNext()
        {
            _column += 1;

            // Stepped past the last column: continue at the beginning of the following row
            bool rowFinished = (long)_column >= (long)_source.Width;
            if (rowFinished)
            {
                _column = 0;
                _row += 1;
            }

            return (long)_row < (long)_source.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: short3
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_short3: IDisposable, IEnumerable<short3>
	{
		IntPtr _intPtr;
		short3* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Allocates pinned host memory for a 2D block of short3 elements with cuMemHostAlloc.<para/>
        /// The allocation size is pitchInBytes * height.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(short3)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return CUResult.Success</exception>
        public CudaPageLockedHostMemory2D_short3(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = IntPtr.Zero;
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(short3));
            _sizeInBytes = _pitchInBytes * _height;

            // One row of elements has to fit into the pitch
            bool pitchTooSmall = _typeSize * width > _pitchInBytes;
            if (pitchTooSmall)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(short3)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (short3*) _intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_short3(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(short3). Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_short3(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(short3)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_short3 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(short3). Using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_short3(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(short3)), height, allocFlags)
        {

        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_short3()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing"></param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns></returns>
        public short3 this[SizeT x, SizeT y]
        {
            get
            {
				short3* line = (short3*)(((byte*)_ptr) + _pitchInBytes * y);
				return line[x];
            }
            set
            {
				short3* line = (short3*)(((byte*)_ptr) + _pitchInBytes * y);
				line[x] = value;                
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CudaDeviceVariable<short3> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CudaDeviceVariable<short3> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<short3> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<short3> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaDeviceVariable<short3> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<short3> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<short3> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<short3> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns></returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        IEnumerator<short3> IEnumerable<short3>.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator<short3> enumerator = new CudaPageLockedHostMemory2DEnumerator_short3(this);
            return enumerator;
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator enumerator = new CudaPageLockedHostMemory2DEnumerator_short3(this);
            return enumerator;
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_short3. Elements are visited row by row,
    /// each row from column 0 up to Width - 1.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_short3 : IEnumerator<short3>
    {
        private CudaPageLockedHostMemory2D_short3 _memory = null;
        // Starts one column before the first element so that the first MoveNext() lands on (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of the given memory.
        /// </summary>
        /// <param name="memory">Memory to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_short3(CudaPageLockedHostMemory2D_short3 memory)
        {
            _memory = memory;
        }

        // The enumerator owns no resources; the enumerated memory is left untouched.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position. The position itself is not validated here, so the value is
        /// only meaningful after <see cref="MoveNext"/> has returned <c>true</c>.
        /// </summary>
        public short3 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Non-generic variant of <see cref="Current"/>; the element is returned boxed.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances the enumerator to the next element. Each row is walked from its first to its last column
        /// before the position moves on to the first column of the following row.
        /// </summary>
        /// <returns><c>true</c> if the position refers to a row below Height; <c>false</c> once all rows
        /// have been passed or if the memory has a width of zero.</returns>
        public bool MoveNext()
        {
            // A zero-width memory contains no elements. Without this guard the wrap-around below
            // would report one bogus element at x = 0 for every row after the first one.
            if ((long)_memory.Width == 0)
                return false;

            _currentX+=1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of the row reached: continue with the first column of the next row.
                _currentX = 0;
                _currentY+=1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: short4
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_short4: IDisposable, IEnumerable<short4>
	{
		IntPtr _intPtr;
		short4* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Allocates pitchInBytes * height bytes of page locked host memory for short4 elements through cuMemHostAlloc.
        /// </summary>
        /// <param name="width">Number of elements per row</param>
        /// <param name="pitchInBytes">Row width in bytes including alignment; must not be smaller than width * sizeof(short4)</param>
        /// <param name="height">Number of rows</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_short4(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            this._intPtr = new IntPtr();
            this._width = width;
            this._pitchInBytes = pitchInBytes;
            this._height = height;
            this._typeSize = (SizeT)Marshal.SizeOf(typeof(short4));
            this._sizeInBytes = this._pitchInBytes * this._height;

            // One row of 'width' elements has to fit into a single pitch.
            bool rowExceedsPitch = this._typeSize * width > this._pitchInBytes;
            if (rowExceedsPitch)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(short4)", "pitchInBytes");
            }

            this.res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref this._intPtr, this._sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", this.res));
            // The typed pointer is taken over before the result of the allocation is inspected.
            this._ptr = (short4*) this._intPtr;
            if (this.res != CUResult.Success)
            {
                throw new CudaException(this.res);
            }
        }

        /// <summary>
        /// Allocates page locked host memory for short4 elements with an explicit pitch, passing 0 as allocation flags to cuMemHostAlloc.
        /// </summary>
        /// <param name="width">Number of elements per row</param>
        /// <param name="pitchInBytes">Row width in bytes including alignment</param>
        /// <param name="height">Number of rows</param>
        public CudaPageLockedHostMemory2D_short4(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Allocates tightly packed page locked host memory for short4 elements, passing 0 as allocation flags to cuMemHostAlloc.<para/>
        /// The pitch is set to width * sizeof(short4).
        /// </summary>
        /// <param name="width">Number of elements per row</param>
        /// <param name="height">Number of rows</param>
        public CudaPageLockedHostMemory2D_short4(SizeT width, SizeT height)
            : this(
                width,
                width * (SizeT)Marshal.SizeOf(typeof(short4)),
                height,
                (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Allocates tightly packed page locked host memory for short4 elements, handing the given flags to cuMemHostAlloc.<para/>
        /// The pitch is set to width * sizeof(short4).
        /// </summary>
        /// <param name="width">Number of elements per row</param>
        /// <param name="height">Number of rows</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_short4(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(
                width,
                width * (SizeT)Marshal.SizeOf(typeof(short4)),
                height,
                allocFlags)
        {
        }

        /// <summary>
        /// Finalizer; forwards to <see cref="Dispose(bool)"/> with <c>false</c>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_short4()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory and removes this object from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core of the dispose pattern. Called with <c>true</c> on a live object it frees the memory through
        /// cuMemFreeHost and marks the object as disposed; called with <c>false</c> on a live object it only
        /// writes a not-disposed warning to the debug output. On an already disposed object it does nothing.
        /// </summary>
        /// <param name="fDisposing"><c>true</c> when invoked by <see cref="Dispose()"/>, <c>false</c> when invoked by the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // The driver result is logged only; it is not turned into an exception.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Address of the pinned host memory as received from cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Row pitch in bytes, as given at construction.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one short4 element in bytes, as reported by Marshal.SizeOf at construction.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element. The indices are not range checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element in column x of row y</returns>
        /// <exception cref="ObjectDisposedException">The memory has already been freed</exception>
        public short4 this[SizeT x, SizeT y]
        {
            get
            {
                // Dereferencing the pointer after cuMemFreeHost would read freed memory.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                // Rows start _pitchInBytes apart, so the row offset is applied in bytes before indexing by element.
                short4* line = (short4*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                // Dereferencing the pointer after cuMemFreeHost would write to freed memory.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                short4* line = (short4*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronously copies this host buffer into a 2D CUDA array via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host buffer is the source (stride = pitch); the array is the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer into a 2D CUDA array.<para/>
        /// Convenience overload that forwards the array's handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this host buffer via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // The array is the source; the host buffer is the destination (stride = pitch).
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this host buffer.<para/>
        /// Convenience overload that forwards the array's handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from this host buffer to device memory (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from this host buffer to the memory of a device variable (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<short4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from device memory into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from a device variable into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<short4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this host buffer into pitched device memory via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Row stride of the destination, in bytes</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Each side keeps its own row stride; only Width * TypeSize bytes per row are transferred.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer into a pitched device variable.<para/>
        /// Convenience overload that forwards the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<short4> deviceVar)
        {
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronously copies pitched device memory into this host buffer via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Row stride of the source, in bytes</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Each side keeps its own row stride; only Width * TypeSize bytes per row are transferred.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a pitched device variable into this host buffer.<para/>
        /// Convenience overload that forwards the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<short4> deviceVar)
        {
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Copies this host buffer into a 2D CUDA array by calling cuMemcpy2DAsync_v2 with the given stream.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host buffer is the source (stride = pitch); the array is the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of this host buffer into a 2D CUDA array.<para/>
        /// Convenience overload that forwards the array's handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream passed to the driver call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Copies a 2D CUDA array into this host buffer by calling cuMemcpy2DAsync_v2 with the given stream.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // The array is the source; the host buffer is the destination (stride = pitch).
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of a 2D CUDA array into this host buffer.<para/>
        /// Convenience overload that forwards the array's handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array</param>
        /// <param name="stream">Stream passed to the driver call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from this host buffer to device memory by calling cuMemcpyHtoDAsync_v2 with the given stream.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from device memory into this host buffer by calling cuMemcpyDtoHAsync_v2 with the given stream.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from this host buffer to a device variable by calling cuMemcpyHtoDAsync_v2 with the given stream.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<short4> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from a device variable into this host buffer by calling cuMemcpyDtoHAsync_v2 with the given stream.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<short4> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Copies this host buffer into pitched device memory by calling cuMemcpy2DAsync_v2 with the given stream.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Row stride of the destination, in bytes</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Each side keeps its own row stride; only Width * TypeSize bytes per row are transferred.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of this host buffer into a pitched device variable.<para/>
        /// Convenience overload that forwards the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream passed to the driver call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<short4> deviceVar, CUstream stream)
        {
            this.AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Copies pitched device memory into this host buffer by calling cuMemcpy2DAsync_v2 with the given stream.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Row stride of the source, in bytes</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Each side keeps its own row stride; only Width * TypeSize bytes per row are transferred.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of a pitched device variable into this host buffer.<para/>
        /// Convenience overload that forwards the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream passed to the driver call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<short4> deviceVar, CUstream stream)
        {
            this.AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer that maps this pinned host buffer (cuMemHostGetDevicePointer_v2).
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>The allocation flags reported by the driver</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the call that was actually made; the label previously named cuMemHostGetDevicePointer.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumeration over all elements; refuses to enumerate a disposed buffer.
        IEnumerator<short4> IEnumerable<short4>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_short4(this);
        }

        // Untyped enumeration over all elements; refuses to enumerate a disposed buffer.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_short4(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_short4.<para/>
    /// Visits the elements row by row (x runs fastest), starting at (0, 0).
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_short4 : IEnumerator<short4>
    {
        private CudaPageLockedHostMemory2D_short4 _memory = null;
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // False until the first successful MoveNext. Replaces an x = -1 "before first" marker so that
        // the cursor never holds a coordinate that would address memory outside the buffer.
        private bool _started = false;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        /// <exception cref="ArgumentNullException"><paramref name="memory"/> is null</exception>
        public CudaPageLockedHostMemory2DEnumerator_short4(CudaPageLockedHostMemory2D_short4 memory)
        {
            if (memory == null) throw new ArgumentNullException("memory");
            _memory = memory;
        }

        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = 0;
            _currentY = 0;
            _started = false;
        }

        /// <summary>
        /// Element at the current position.
        /// </summary>
        /// <exception cref="InvalidOperationException">The enumerator is positioned before the first or after the last element</exception>
        public short4 Current
        {
            get
            {
                // The buffer's indexer is an unchecked pointer access, so the position has to be
                // validated here to avoid touching memory outside the allocation.
                if (!IsOnElement())
                    throw new InvalidOperationException("Enumerator is positioned before the first or after the last element.");
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Element at the current position (boxed).
        /// </summary>
        /// <exception cref="InvalidOperationException">The enumerator is positioned before the first or after the last element</exception>
        object IEnumerator.Current
        {
            get { return Current; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false once the end has been passed or the buffer has no elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A buffer without columns or rows has nothing to enumerate.
            if (width <= 0 || height <= 0)
                return false;

            if (!_started)
            {
                // The cursor already sits on (0, 0); the first call only activates it.
                _started = true;
                return true;
            }

            // Once past the last row, stay there.
            if ((long)_currentY >= height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }

        // True while the cursor refers to an element inside the buffer.
        private bool IsOnElement()
        {
            return _started
                && (long)_currentX < (long)_memory.Width
                && (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: ushort
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_ushort: IDisposable, IEnumerable<ushort>
	{
		IntPtr _intPtr;
		ushort* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Allocates page-locked host memory of pitchInBytes * height bytes with cuMemHostAlloc and wraps it.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(ushort)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_ushort(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _typeSize = (SizeT)Marshal.SizeOf(typeof(ushort));
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _sizeInBytes = _pitchInBytes * _height;
            _intPtr = new IntPtr();

            // A row must be able to hold all of its elements.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(ushort)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (ushort*)_intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Allocates page-locked host memory with an explicit pitch, passing no allocation flags (0) to cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ushort(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {
        }

        /// <summary>
        /// Allocates tightly packed page-locked host memory, passing no allocation flags (0) to cuMemHostAlloc.<para/>
        /// Pitch is set to width * sizeof(ushort).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ushort(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ushort)), height, 0)
        {
        }

        /// <summary>
        /// Allocates tightly packed page-locked host memory with cuMemHostAlloc using the given flags.<para/>
        /// Pitch is set to width * sizeof(ushort).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_ushort(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ushort)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer; runs the non-disposing path of <see cref="Dispose(bool)"/>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_ushort()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory and removes this object from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core of the dispose pattern. When called from <see cref="Dispose()"/> the host memory is
        /// released with cuMemFreeHost; when called from the finalizer only a debug warning is written.
        /// Does nothing if the object is already disposed.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // The result is only traced, never thrown.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Gets the raw host address of the pinned allocation wrapped by this object.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Gets the number of elements per row.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Gets the number of rows.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Gets the row stride of the host buffer, in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Gets the total size of the host buffer in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Gets the size of a single ushort element in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element of the pinned buffer.<para/>
        /// The row start is computed as base address + pitch * y; no bounds or disposed checks are performed.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element stored at column <paramref name="x"/> of row <paramref name="y"/></returns>
        public ushort this[SizeT x, SizeT y]
        {
            get
            {
                // Step to the requested row in bytes, then index inside the row in elements.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((ushort*)rowStart)[x];
            }
            set
            {
                // Step to the requested row in bytes, then index inside the row in elements.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((ushort*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronously copies this host buffer into a 2D CUDA array via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host buffer is the source (stride = pitch); the array is the destination.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer into a 2D CUDA array.<para/>
        /// Convenience overload that forwards the array's handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this host buffer via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // The array is the source; the host buffer is the destination (stride = pitch).
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this host buffer.<para/>
        /// Convenience overload that forwards the array's handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from this host buffer to device memory (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from this host buffer to the memory of a device variable (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<ushort> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from device memory into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from a device variable into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<ushort> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this host buffer into pitched device memory via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Row stride of the destination, in bytes</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Each side keeps its own row stride; only Width * TypeSize bytes per row are transferred.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer to a pitched device variable.<para/>
        /// Forwards the variable's DevicePointer and Pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Pitched device variable used as copy destination</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<ushort> deviceVar)
        {
            CUdeviceptr destination = deviceVar.DevicePointer;
            SizeT destinationPitch = deviceVar.Pitch;

            SynchronCopyToDevice(destination, destinationPitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this pinned host buffer (cuMemcpy2D_v2).<para/>
        /// Copies Height lines of Width * TypeSize bytes each.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: pitched device memory
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                // Destination: this pinned host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's DevicePointer and Pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Pitched device variable used as copy source</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<ushort> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;

            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to a CUDA array (cuMemcpy2DAsync_v2).<para/>
        /// Copies Height lines of Width * TypeSize bytes each.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // Destination: CUDA array
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                // Extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to a 2D CUDA array.<para/>
        /// Forwards the array's CUArray handle to the (CUarray, CUstream) overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray destination = array.CUArray;

            AsyncCopyToArray2D(destination, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from a CUDA array into this pinned host buffer (cuMemcpy2DAsync_v2).<para/>
        /// Copies Height lines of Width * TypeSize bytes each.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: CUDA array
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                // Destination: this pinned host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a 2D CUDA array into this pinned host buffer.<para/>
        /// Forwards the array's CUArray handle to the (CUarray, CUstream) overload.
        /// </summary>
        /// <param name="array">Source array</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray source = array.CUArray;

            AsyncCopyFromArray2D(source, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy from this pinned host buffer to device memory (cuMemcpyHtoDAsync_v2).<para/>
        /// <see cref="SizeInBytes"/> bytes are transferred.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr,
                _intPtr,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from device memory into this pinned host buffer (cuMemcpyDtoHAsync_v2).<para/>
        /// <see cref="SizeInBytes"/> bytes are transferred.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr,
                devicePtr,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from this pinned host buffer to a device variable (cuMemcpyHtoDAsync_v2).<para/>
        /// <see cref="SizeInBytes"/> bytes are transferred.
        /// </summary>
        /// <param name="devicePtr">Device variable whose DevicePointer is the copy destination</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<ushort> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                destination,
                _intPtr,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a device variable into this pinned host buffer (cuMemcpyDtoHAsync_v2).<para/>
        /// <see cref="SizeInBytes"/> bytes are transferred.
        /// </summary>
        /// <param name="devicePtr">Device variable whose DevicePointer is the copy source</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<ushort> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr,
                source,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to pitched device memory (cuMemcpy2DAsync_v2).<para/>
        /// Copies Height lines of Width * TypeSize bytes each.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // Destination: pitched device memory
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                // Extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to a pitched device variable.<para/>
        /// Forwards the variable's DevicePointer and Pitch to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Pitched device variable used as copy destination</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<ushort> deviceVar, CUstream stream)
        {
            CUdeviceptr destination = deviceVar.DevicePointer;
            SizeT destinationPitch = deviceVar.Pitch;

            AsyncCopyToDevice(destination, destinationPitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this pinned host buffer (cuMemcpy2DAsync_v2).<para/>
        /// Copies Height lines of Width * TypeSize bytes each.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: pitched device memory
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                // Destination: this pinned host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's DevicePointer and Pitch to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Pitched device variable used as copy source</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<ushort> deviceVar, CUstream stream)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;

            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the CUdeviceptr under which this pinned host memory is mapped into device memory space
        /// (cuMemHostGetDevicePointer_v2). Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags)
        /// </summary>
        /// <returns>Flags reported by the driver for this host pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the API that was actually called; the label used to name cuMemHostGetDevicePointer.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumerator over all elements of this buffer; refuses to enumerate a disposed object.
        IEnumerator<ushort> IEnumerable<ushort>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ushort(this);
        }

        // Non-generic enumerator; backed by the same enumerator type as the generic variant.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ushort(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_ushort.<para/>
    /// Elements are visited line by line: x runs from 0 to Width - 1 before y is advanced.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_ushort : IEnumerator<ushort>
    {
        // Buffer being enumerated
        private CudaPageLockedHostMemory2D_ushort _memory = null;
        // Current position; x starts one before the first element
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>
        /// </summary>
        /// <param name="memory">Pinned host memory to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_ushort(CudaPageLockedHostMemory2D_ushort memory)
        {
            _memory = memory;
        }

        // Nothing to release: the enumerator only holds a reference to the memory object.
        void IDisposable.Dispose()
        {
        }

        /// <summary>
        /// Moves the enumerator back to its initial position before the first element
        /// </summary>
        public void Reset()
        {
            _currentY = 0;
            _currentX = -1;
        }

        /// <summary>
        /// Element at the current (x, y) position, read through the memory object's indexer
        /// </summary>
        public ushort Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Boxed element at the current (x, y) position
        /// </summary>
        object IEnumerator.Current
        {
            get
            {
                return Current;
            }
        }

        /// <summary>
        /// Advances to the next element, wrapping to the start of the next line at the end of a line
        /// </summary>
        /// <returns>true while the current line index is below Height, otherwise false</returns>
        public bool MoveNext()
        {
            _currentX += 1;

            long lineLength = (long)_memory.Width;
            if ((long)_currentX >= lineLength)
            {
                // End of line reached: continue with the first element of the next line
                _currentX = 0;
                _currentY += 1;
            }

            long lineCount = (long)_memory.Height;
            return (long)_currentY < lineCount;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: ushort1
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_ushort1: IDisposable, IEnumerable<ushort1>
	{
		IntPtr _intPtr;
		ushort1* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates pitchInBytes * height bytes on host using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(ushort1)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return CUResult.Success</exception>
        public CudaPageLockedHostMemory2D_ushort1(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(ushort1));
            _sizeInBytes = _pitchInBytes * _height;

            // One line of width elements has to fit into the pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(ushort1)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));

            // Typed view on the allocation, used by the indexer.
            _ptr = (ushort1*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates the memory on host
        /// using cuMemHostAlloc without flags (allocFlags = 0).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ushort1(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {
            // All work is done by the four-argument constructor.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates the memory on host
        /// using cuMemHostAlloc without flags (allocFlags = 0).<para/>
        /// Pitch is assumed to be width * sizeof(ushort1).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ushort1(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ushort1)), height, 0)
        {
            // All work is done by the four-argument constructor.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort1 and allocates the memory on host
        /// using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(ushort1).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_ushort1(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ushort1)), height, allocFlags)
        {
            // All work is done by the four-argument constructor.
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>
        /// </summary>
        ~CudaPageLockedHostMemory2D_ushort1()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory and suppresses finalization of this object
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable. Frees the host memory with cuMemFreeHost when called from <see cref="Dispose()"/>;
        /// when called from the finalizer on a not yet disposed object only a debug warning is written.
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                // Already released: nothing to free and nothing to warn about.
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Host pointer of the pinned allocation, as filled in by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width of one line in elements, as given to the constructor
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of lines (height in elements), as given to the constructor
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Pitch in bytes: distance between the starts of two consecutive lines
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Size of the allocation in bytes (pitch in bytes * height)
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one element in bytes (Marshal.SizeOf of ushort1)
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element. The line start is located at y * pitch bytes from the
        /// beginning of the buffer; x then selects the element within that line.<para/>
        /// Indices are not range checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>Element at position (x, y)</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        public ushort1 this[SizeT x, SizeT y]
        {
            get
            {
                // After Dispose() the buffer has been handed to cuMemFreeHost; never dereference it.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                ushort1* line = (ushort1*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                // Same guard as in the getter: writing to freed pinned memory would corrupt the heap.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                ushort1* line = (ushort1*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer to a CUDA array (cuMemcpy2D_v2).<para/>
        /// Copies Height lines of Width * TypeSize bytes each.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // Destination: CUDA array
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                // Extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer to a 2D CUDA array.<para/>
        /// Forwards the array's CUArray handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray destination = array.CUArray;

            SynchronCopyToArray2D(destination);
        }

        /// <summary>
        /// Synchronous 2D copy from a CUDA array into this pinned host buffer (cuMemcpy2D_v2).<para/>
        /// Copies Height lines of Width * TypeSize bytes each.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: CUDA array
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                // Destination: this pinned host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a 2D CUDA array into this pinned host buffer.<para/>
        /// Forwards the array's CUArray handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray source = array.CUArray;

            SynchronCopyFromArray2D(source);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous copy from this pinned host buffer to device memory (cuMemcpyHtoD_v2).<para/>
        /// <see cref="SizeInBytes"/> bytes are transferred.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from this pinned host buffer to a device variable (cuMemcpyHtoD_v2).<para/>
        /// <see cref="SizeInBytes"/> bytes are transferred.
        /// </summary>
        /// <param name="devicePtr">Device variable whose DevicePointer is the copy destination</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<ushort1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(destination, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from device memory into this pinned host buffer (cuMemcpyDtoH_v2).<para/>
        /// <see cref="SizeInBytes"/> bytes are transferred.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from a device variable into this pinned host buffer (cuMemcpyDtoH_v2).<para/>
        /// <see cref="SizeInBytes"/> bytes are transferred.
        /// </summary>
        /// <param name="devicePtr">Device variable whose DevicePointer is the copy source</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<ushort1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer to pitched device memory (cuMemcpy2D_v2).<para/>
        /// Copies Height lines of Width * TypeSize bytes each.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // Destination: pitched device memory
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                // Extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer to a pitched device variable.<para/>
        /// Forwards the variable's DevicePointer and Pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Pitched device variable used as copy destination</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<ushort1> deviceVar)
        {
            CUdeviceptr destination = deviceVar.DevicePointer;
            SizeT destinationPitch = deviceVar.Pitch;

            SynchronCopyToDevice(destination, destinationPitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this pinned host buffer (cuMemcpy2D_v2).<para/>
        /// Copies Height lines of Width * TypeSize bytes each.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: pitched device memory
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                // Destination: this pinned host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's DevicePointer and Pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Pitched device variable used as copy source</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<ushort1> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;

            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to a CUDA array (cuMemcpy2DAsync_v2).<para/>
        /// Copies Height lines of Width * TypeSize bytes each.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // Destination: CUDA array
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                // Extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to a 2D CUDA array.<para/>
        /// Forwards the array's CUArray handle to the (CUarray, CUstream) overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray destination = array.CUArray;

            AsyncCopyToArray2D(destination, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from a CUDA array into this pinned host buffer (cuMemcpy2DAsync_v2).<para/>
        /// Copies Height lines of Width * TypeSize bytes each.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream passed to the asynchronous copy call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: CUDA array
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                // Destination: this pinned host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // Extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies a 2D CUDA array into this host buffer.
        /// Forwards to <see cref="AsyncCopyFromArray2D(CUarray, CUstream)"/> with the handle exposed by <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray arrayHandle = array.CUArray;
            AsyncCopyFromArray2D(arrayHandle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronously copies <see cref="SizeInBytes"/> bytes from this host buffer to device memory using cuMemcpyHtoDAsync.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies <see cref="SizeInBytes"/> bytes from device memory into this host buffer using cuMemcpyDtoHAsync.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies <see cref="SizeInBytes"/> bytes from this host buffer to the memory of a device variable using cuMemcpyHtoDAsync.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<ushort1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies <see cref="SizeInBytes"/> bytes from the memory of a device variable into this host buffer using cuMemcpyDtoHAsync.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<ushort1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronously copies this host buffer to pitched device memory using cuMemcpy2DAsync.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device rows may have different strides, hence a 2D copy with separate pitches.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies this host buffer to a pitched device variable.
        /// Forwards the variable's device pointer and pitch to <see cref="AsyncCopyToDevice(CUdeviceptr, SizeT, CUstream)"/>.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<ushort1> deviceVar, CUstream stream)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(target, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronously copies pitched device memory into this host buffer using cuMemcpy2DAsync.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device rows may have different strides, hence a 2D copy with separate pitches.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies a pitched device variable into this host buffer.
        /// Forwards the variable's device pointer and pitch to <see cref="AsyncCopyFromDevice(CUdeviceptr, SizeT, CUstream)"/>.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<ushort1> deviceVar, CUstream stream)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer obtained from cuMemHostGetDevicePointer for this host allocation</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPtr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPtr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            return mappedPtr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns>Flags reported by cuMemHostGetFlags for this host allocation</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log under the name of the API actually invoked; the label previously read
            // "cuMemHostGetDevicePointer", which made debug traces point at the wrong call.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumerator over the elements of this buffer; refuses to enumerate a disposed instance.
        IEnumerator<ushort1> IEnumerable<ushort1>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ushort1(this);
        }

        // Non-generic enumerator; backed by the same enumerator type as the generic version.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ushort1(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_ushort1.<para/>
    /// Elements are visited row by row: x advances through a row first, then y moves on to the next row.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_ushort1 : IEnumerator<ushort1>
    {
        // Buffer being enumerated; elements are read through its [x, y] indexer.
        private CudaPageLockedHostMemory2D_ushort1 _memory = null;
        // Position of the current element. X starts at -1 so that the first MoveNext() lands on (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_ushort1(CudaPageLockedHostMemory2D_ushort1 memory)
        {
            _memory = memory;
        }

        // The enumerator holds no resources of its own, so there is nothing to release.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentY = 0;
            _currentX = -1;
        }

        /// <summary>
        /// Element at the current (x, y) position.
        /// </summary>
        public ushort1 Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Element at the current (x, y) position, boxed as <see cref="object"/>.
        /// </summary>
        object IEnumerator.Current
        {
            get
            {
                return Current;
            }
        }

        /// <summary>
        /// Advances to the next element, wrapping to the start of the next row at the end of a row.
        /// </summary>
        /// <returns>true while the current row index is below the buffer height, false otherwise</returns>
        public bool MoveNext()
        {
            _currentX += 1;

            bool pastEndOfRow = (long)_currentX >= (long)_memory.Width;
            if (pastEndOfRow)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page-locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: ushort2
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_ushort2: IDisposable, IEnumerable<ushort2>
	{
		// Address of the pinned host allocation; filled in by cuMemHostAlloc in the constructor.
		IntPtr _intPtr;
		// Same address as _intPtr, typed as ushort2* for element access in the indexer.
		ushort2* _ptr;
        // Total allocation size in bytes, computed as _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Number of elements per row.
        SizeT _width = 0;
        // Distance in bytes between the starts of two consecutive rows (at least _width * _typeSize).
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(ushort2)), i.e. the size of one element in bytes.
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made by this instance.
        CUResult res;
        // Set once Dispose has released the host allocation; guards the public methods.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort2 and allocates pitchInBytes * height bytes of host memory using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes; must be at least width * sizeof(ushort2)</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException"><paramref name="pitchInBytes"/> is smaller than one row of elements</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_ushort2(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(ushort2));
            _sizeInBytes = _pitchInBytes * _height;

            // Reject a pitch that cannot hold a full row before touching the driver.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(ushort2)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (ushort2*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort2 and allocates the memory on host using cuMemHostAlloc with no flags set.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ushort2(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort2 and allocates the memory on host using cuMemHostAlloc with no flags set.<para/>
        /// Pitch is assumed to be width * sizeof(ushort2), i.e. rows are stored without padding.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ushort2(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ushort2)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort2 and allocates the memory on host using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(ushort2), i.e. rows are stored without padding.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_ushort2(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ushort2)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>, which only
        /// writes a debug warning if the instance was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_ushort2()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory and suppresses finalization of this instance.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable. Frees the host allocation with cuMemFreeHost when called from <see cref="Dispose()"/>;
        /// when called from the finalizer it frees nothing and only writes a not-disposed warning to the debug output.
        /// Does nothing once the instance is marked as disposed.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // The result is logged but deliberately not turned into an exception.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Address of the pinned host allocation as returned by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return _intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return _width;
            }
        }

        /// <summary>
        /// Number of rows.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return _height;
            }
        }

        /// <summary>
        /// Row pitch in bytes, i.e. the distance between the starts of two consecutive rows.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return _pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return _sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one ushort2 element in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return _typeSize;
            }
        }

        /// <summary>
        /// Access array per element. The row is located at byte offset pitch * y from the start of the buffer,
        /// and the element is taken at index x within that row. Indices are not range-checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element stored at (x, y)</returns>
        public ushort2 this[SizeT x, SizeT y]
        {
            get
            {
                // Row addressing is done in bytes because the pitch need not be a multiple of the element size.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((ushort2*)rowStart)[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((ushort2*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronously copies this host buffer to a 2D CUDA array using cuMemcpy2D.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source is this pinned host buffer (row stride = _pitchInBytes); destination is the CUDA array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer to a 2D CUDA array.
        /// Forwards to <see cref="SynchronCopyToArray2D(CUarray)"/> with the handle exposed by <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray arrayHandle = array.CUArray;
            SynchronCopyToArray2D(arrayHandle);
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this host buffer using cuMemcpy2D.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source is the CUDA array; destination is this pinned host buffer (row stride = _pitchInBytes).
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this host buffer.
        /// Forwards to <see cref="SynchronCopyFromArray2D(CUarray)"/> with the handle exposed by <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray arrayHandle = array.CUArray;
            SynchronCopyFromArray2D(arrayHandle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronously copies the whole host buffer (<see cref="SizeInBytes"/> bytes, i.e. pitch * height) to device memory using cuMemcpyHtoD.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies the whole host buffer (<see cref="SizeInBytes"/> bytes, i.e. pitch * height) to the memory of a device variable using cuMemcpyHtoD.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<ushort2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes (pitch * height) from device memory into this host buffer using cuMemcpyDtoH.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes (pitch * height) from the memory of a device variable into this host buffer using cuMemcpyDtoH.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<ushort2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this host buffer to pitched device memory using cuMemcpy2D.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device rows may have different strides, hence a 2D copy with separate pitches.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer to a pitched device variable.
        /// Forwards the variable's device pointer and pitch to <see cref="SynchronCopyToDevice(CUdeviceptr, SizeT)"/>.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<ushort2> deviceVar)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Synchronously copies pitched device memory into this host buffer using cuMemcpy2D.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device rows may have different strides, hence a 2D copy with separate pitches.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a pitched device variable into this host buffer.
        /// Forwards the variable's device pointer and pitch to <see cref="SynchronCopyFromDevice(CUdeviceptr, SizeT)"/>.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<ushort2> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronously copies this host buffer to a 2D CUDA array using cuMemcpy2DAsync.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source is this pinned host buffer (row stride = _pitchInBytes); destination is the CUDA array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies this host buffer to a 2D CUDA array.
        /// Forwards to <see cref="AsyncCopyToArray2D(CUarray, CUstream)"/> with the handle exposed by <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray arrayHandle = array.CUArray;
            AsyncCopyToArray2D(arrayHandle, stream);
        }

        /// <summary>
        /// Asynchronously copies a 2D CUDA array into this host buffer using cuMemcpy2DAsync.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source is the CUDA array; destination is this pinned host buffer (row stride = _pitchInBytes).
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies a 2D CUDA array into this host buffer.
        /// Forwards to <see cref="AsyncCopyFromArray2D(CUarray, CUstream)"/> with the handle exposed by <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray arrayHandle = array.CUArray;
            AsyncCopyFromArray2D(arrayHandle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronously copies the whole host buffer (<see cref="SizeInBytes"/> bytes, i.e. pitch * height) to device memory using cuMemcpyHtoDAsync.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies <see cref="SizeInBytes"/> bytes (pitch * height) from device memory into this host buffer using cuMemcpyDtoHAsync.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies the whole host buffer (<see cref="SizeInBytes"/> bytes, i.e. pitch * height) to the memory of a device variable using cuMemcpyHtoDAsync.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<ushort2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies <see cref="SizeInBytes"/> bytes (pitch * height) from the memory of a device variable into this host buffer using cuMemcpyDtoHAsync.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<ushort2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronously copies this host buffer to pitched device memory using cuMemcpy2DAsync.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device rows may have different strides, hence a 2D copy with separate pitches.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies this host buffer to a pitched device variable.
        /// Forwards the variable's device pointer and pitch to <see cref="AsyncCopyToDevice(CUdeviceptr, SizeT, CUstream)"/>.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<ushort2> deviceVar, CUstream stream)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(target, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronously copies pitched device memory into this host buffer using cuMemcpy2DAsync.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device rows may have different strides, hence a 2D copy with separate pitches.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable into this pinned host buffer.
        /// Forwards the variable's device pointer and pitch to the pointer based overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream handed to the pointer based overload</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<ushort2> deviceVar, CUstream stream)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;

            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer that corresponds to this pinned host allocation (cuMemHostGetDevicePointer).
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mapped = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mapped, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res == CUResult.Success)
            {
                return mapped;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>Flags reported by the driver for this allocation</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace under the name of the API that was actually called.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumerator over the elements of this buffer; refuses to enumerate a disposed object.
        IEnumerator<ushort2> IEnumerable<ushort2>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ushort2(this);
        }

        // Non-generic enumerator over the elements of this buffer; refuses to enumerate a disposed object.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ushort2(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_ushort2.<para/>
    /// Visits the elements row by row: x runs from 0 to Width - 1 for each y from 0 to Height - 1.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_ushort2 : IEnumerator<ushort2>
    {
        private CudaPageLockedHostMemory2D_ushort2 _memory = null;
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // False until the first MoveNext after construction or Reset. An explicit flag is used
        // instead of storing -1 in a SizeT: how SizeT converts a negative int is not visible here
        // (NOTE(review): SizeT is presumably an unsigned size type - confirm), so the position
        // fields only ever hold non-negative values.
        private bool _started = false;

        /// <summary>
        /// Creates an enumerator over the given pinned host memory.
        /// </summary>
        /// <param name="memory">Memory whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_ushort2(CudaPageLockedHostMemory2D_ushort2 memory)
        {
            _memory = memory;
        }

        // The enumerator holds no resources of its own.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Puts the enumerator back in front of the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = 0;
            _currentY = 0;
            _started = false;
        }

        /// <summary>
        /// Element at the current position. Only meaningful after MoveNext returned true.
        /// </summary>
        public ushort2 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed). Only meaningful after MoveNext returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator is positioned on an element; false if there are no (more) elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A buffer without columns or rows has no elements at all.
            if (width <= 0 || height <= 0)
                return false;

            if (!_started)
            {
                // First call: the position is already (0, 0).
                _started = true;
                return true;
            }

            // Already past the last row: stay there.
            if ((long)_currentY >= height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                // End of the row reached, continue with the first element of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// The memory is organized as <c>Height</c> rows of <c>Pitch</c> bytes, each row holding <c>Width</c> elements.<para/>
	/// Type: ushort3
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_ushort3: IDisposable, IEnumerable<ushort3>
	{
        // Pointer returned by cuMemHostAlloc, and the same address as a typed pointer.
        IntPtr _intPtr;
        ushort3* _ptr;
        // Total allocation size: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(ushort3)).
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made through this object.
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort3 and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(ushort3)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_ushort3(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(ushort3));
            _sizeInBytes = _pitchInBytes * _height;

            // A row must be able to hold all of its elements.
            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(ushort3)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (ushort3*) _intPtr;
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort3 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ushort3(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort3 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(ushort3).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ushort3(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ushort3)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort3 and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(ushort3).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_ushort3(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ushort3)), height, allocFlags)
        {

        }

        /// <summary>
        /// Finalizer. Calls Dispose(false), which does not free the memory and only writes a debug warning
        /// if the object was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_ushort3()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory (cuMemFreeHost) and suppresses finalization.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing">
        /// true when called from <see cref="Dispose()"/>: the memory is freed once.<para/>
        /// false when called from the finalizer: nothing is freed, a debug warning is written if the object was not disposed.
        /// </param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                // The result of cuMemFreeHost is only traced; Dispose does not throw on failure.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes (pitch * height)
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element. The indices are not range checked and the disposed state is not checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>Element at column x of row y</returns>
        public ushort3 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are _pitchInBytes apart, so the row start is computed on a byte pointer.
                ushort3* line = (ushort3*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                // Rows are _pitchInBytes apart, so the row start is computed on a byte pointer.
                ushort3* line = (ushort3*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy host to 2D Array (cuMemcpy2D)
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array; its CUArray handle is forwarded</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host (cuMemcpy2D)
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array; its CUArray handle is forwarded</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous copy host to device (cuMemcpyHtoD). Copies SizeInBytes bytes, i.e. including row padding.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to device (cuMemcpyHtoD). Copies SizeInBytes bytes, i.e. including row padding.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        public void SynchronCopyToDevice(CudaDeviceVariable<ushort3> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy device to host (cuMemcpyDtoH). Copies SizeInBytes bytes, i.e. including row padding.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy device to host (cuMemcpyDtoH). Copies SizeInBytes bytes, i.e. including row padding.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        public void SynchronCopyToHost(CudaDeviceVariable<ushort3> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous copy host to pitched device (cuMemcpy2D)
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory</param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable; its pointer and pitch are forwarded</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<ushort3> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous copy pitched device to host (cuMemcpy2D)
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory</param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable; its pointer and pitch are forwarded</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<ushort3> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy host to 2D Array (cuMemcpy2DAsync)
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream handed to cuMemcpy2DAsync</param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array; its CUArray handle is forwarded</param>
        /// <param name="stream">Stream handed to the handle based overload</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host (cuMemcpy2DAsync)
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream handed to cuMemcpy2DAsync</param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array; its CUArray handle is forwarded</param>
        /// <param name="stream">Stream handed to the handle based overload</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy host to device (cuMemcpyHtoDAsync). Copies SizeInBytes bytes, i.e. including row padding.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream handed to cuMemcpyHtoDAsync</param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy device to host (cuMemcpyDtoHAsync). Copies SizeInBytes bytes, i.e. including row padding.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream handed to cuMemcpyDtoHAsync</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to device (cuMemcpyHtoDAsync). Copies SizeInBytes bytes, i.e. including row padding.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream handed to cuMemcpyHtoDAsync</param>
        public void AsyncCopyToDevice(CudaDeviceVariable<ushort3> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy device to host (cuMemcpyDtoHAsync). Copies SizeInBytes bytes, i.e. including row padding.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream handed to cuMemcpyDtoHAsync</param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<ushort3> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous copy host to pitched device (cuMemcpy2DAsync)
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory</param>
        /// <param name="stream">Stream handed to cuMemcpy2DAsync</param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable; its pointer and pitch are forwarded</param>
        /// <param name="stream">Stream handed to the pointer based overload</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<ushort3> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host (cuMemcpy2DAsync)
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory</param>
        /// <param name="stream">Stream handed to cuMemcpy2DAsync</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable; its pointer and pitch are forwarded</param>
        /// <param name="stream">Stream handed to the pointer based overload</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<ushort3> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags)
        /// </summary>
        /// <returns>Flags reported by the driver for this allocation</returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace under the name of the API that was actually called.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumerator over the elements of this buffer; refuses to enumerate a disposed object.
        IEnumerator<ushort3> IEnumerable<ushort3>.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator<ushort3> enumerator = new CudaPageLockedHostMemory2DEnumerator_ushort3(this);
            return enumerator;
        }

        // Non-generic enumerator over the elements of this buffer; refuses to enumerate a disposed object.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator enumerator = new CudaPageLockedHostMemory2DEnumerator_ushort3(this);
            return enumerator;
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_ushort3.<para/>
    /// Visits the elements row by row: x runs from 0 to Width - 1 for each y from 0 to Height - 1.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_ushort3 : IEnumerator<ushort3>
    {
        private CudaPageLockedHostMemory2D_ushort3 _memory = null;
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // False until the first MoveNext after construction or Reset. An explicit flag is used
        // instead of storing -1 in a SizeT: how SizeT converts a negative int is not visible here
        // (NOTE(review): SizeT is presumably an unsigned size type - confirm), so the position
        // fields only ever hold non-negative values.
        private bool _started = false;

        /// <summary>
        /// Creates an enumerator over the given pinned host memory.
        /// </summary>
        /// <param name="memory">Memory whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_ushort3(CudaPageLockedHostMemory2D_ushort3 memory)
        {
            _memory = memory;
        }

        // The enumerator holds no resources of its own.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Puts the enumerator back in front of the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = 0;
            _currentY = 0;
            _started = false;
        }

        /// <summary>
        /// Element at the current position. Only meaningful after MoveNext returned true.
        /// </summary>
        public ushort3 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed). Only meaningful after MoveNext returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator is positioned on an element; false if there are no (more) elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A buffer without columns or rows has no elements at all.
            if (width <= 0 || height <= 0)
                return false;

            if (!_started)
            {
                // First call: the position is already (0, 0).
                _started = true;
                return true;
            }

            // Already past the last row: stay there.
            if ((long)_currentY >= height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                // End of the row reached, continue with the first element of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: ushort4
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_ushort4: IDisposable, IEnumerable<ushort4>
	{
		// Address of the pinned host allocation, filled in by cuMemHostAlloc in the constructor
		IntPtr _intPtr;
		// Same address as _intPtr, typed as ushort4* for element access through the indexer
		ushort4* _ptr;
        // Total size of the allocation in bytes (_pitchInBytes * _height)
        SizeT _sizeInBytes = 0;
        // Number of elements per row
        SizeT _width = 0;
        // Distance in bytes between the starts of two consecutive rows
        SizeT _pitchInBytes = 0;
        // Number of rows
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(ushort4)) in bytes
        SizeT _typeSize = 0;
        // Result code of the most recent driver API call made by this instance
        CUResult res;
        // Set by Dispose() after cuMemFreeHost; the copy and query methods throw ObjectDisposedException afterwards
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort4 and allocates pitchInBytes * height bytes of memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(ushort4)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return CUResult.Success</exception>
        public CudaPageLockedHostMemory2D_ushort4(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(ushort4));
            _sizeInBytes = _pitchInBytes * _height;

            if (_typeSize * width > _pitchInBytes)
            {
                // Nothing has been allocated yet. Without this the finalizer would later
                // report a misleading "not-disposed" warning for an object that never existed.
                GC.SuppressFinalize(this);
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(ushort4)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (ushort4*) _intPtr;
            if (res != CUResult.Success)
            {
                // The allocation failed, so there is nothing the finalizer could warn about.
                GC.SuppressFinalize(this);
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort4 with an explicit pitch and allocates the memory on host.
        /// Forwards to the four-argument constructor with an allocation flag value of 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ushort4(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort4 and allocates the memory on host.<para/>
        /// Pitch is assumed to be width * sizeof(ushort4). Forwards to the four-argument constructor with an allocation flag value of 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ushort4(SizeT width, SizeT height)
            : this(
                width,
                width * (SizeT)Marshal.SizeOf(typeof(ushort4)),
                height,
                (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ushort4 and allocates the memory on host with the given allocation flags.<para/>
        /// Pitch is assumed to be width * sizeof(ushort4). Forwards to the four-argument constructor.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_ushort4(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(
                width,
                width * (SizeT)Marshal.SizeOf(typeof(ushort4)),
                height,
                allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Calls Dispose(false).
        /// </summary>
        ~CudaPageLockedHostMemory2D_ushort4()
        {
            // false: we are called by the finalizer, not by an explicit Dispose()
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory (via Dispose(true)) and removes this instance from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            // Cleanup has been done explicitly; the finalizer no longer needs to run.
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable.<para/>
        /// With fDisposing = true the host memory is freed with cuMemFreeHost and the instance is marked as disposed.
        /// With fDisposing = false (finalizer) nothing is freed; only a warning is written to the debug output.
        /// Calls on an already disposed instance do nothing.
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (!fDisposing)
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
                return;
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
            disposed = true;
        }
        #endregion

        #region Properties
        /// <summary>
        /// Address of the pinned host memory as returned by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Row pitch in bytes, as passed to the constructor.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one ushort4 element in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element. The element is located at byte offset pitch * y from the start
        /// of the buffer, then x elements into that row.<para/>
        /// The indices are not range-checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at (x, y)</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        public ushort4 this[SizeT x, SizeT y]
        {
            get
            {
                // After Dispose() the memory has been released with cuMemFreeHost and must not be dereferenced.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                ushort4* line = (ushort4*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                // After Dispose() the memory has been released with cuMemFreeHost and must not be dereferenced.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                ushort4* line = (ushort4*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronously copies this host buffer to a 2D array using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied per row for Height rows; source rows are read with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer to a 2D array. Forwards array.CUArray to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray arrayHandle = array.CUArray;
            SynchronCopyToArray2D(arrayHandle);
        }

        /// <summary>
        /// Synchronously copies a 2D array into this host buffer using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied per row for Height rows; destination rows are written with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a 2D array into this host buffer. Forwards array.CUArray to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray arrayHandle = array.CUArray;
            SynchronCopyFromArray2D(arrayHandle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronously copies the whole host buffer (SizeInBytes bytes) to device memory using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies the whole host buffer (SizeInBytes bytes) to a device variable using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<ushort4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(destination, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from device memory into this host buffer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from a device variable into this host buffer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<ushort4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this host buffer to pitched device memory using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, used as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer to a pitched device variable.
        /// Forwards the variable's DevicePointer and Pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<ushort4> deviceVar)
        {
            CUdeviceptr devicePointer = deviceVar.DevicePointer;
            SizeT devicePitch = deviceVar.Pitch;
            SynchronCopyToDevice(devicePointer, devicePitch);
        }

        /// <summary>
        /// Synchronously copies pitched device memory into this host buffer using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, used as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a pitched device variable into this host buffer.
        /// Forwards the variable's DevicePointer and Pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<ushort4> deviceVar)
        {
            CUdeviceptr devicePointer = deviceVar.DevicePointer;
            SizeT devicePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(devicePointer, devicePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronously copies this host buffer to a 2D array using cuMemcpy2DAsync_v2 on the given stream.<para/>
        /// Width * TypeSize bytes are copied per row for Height rows; source rows are read with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies this host buffer to a 2D array. Forwards array.CUArray to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream passed on to the CUarray overload</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray arrayHandle = array.CUArray;
            AsyncCopyToArray2D(arrayHandle, stream);
        }

        /// <summary>
        /// Asynchronously copies a 2D array into this host buffer using cuMemcpy2DAsync_v2 on the given stream.<para/>
        /// Width * TypeSize bytes are copied per row for Height rows; destination rows are written with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies a 2D array into this host buffer. Forwards array.CUArray to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array</param>
        /// <param name="stream">Stream passed on to the CUarray overload</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray arrayHandle = array.CUArray;
            AsyncCopyFromArray2D(arrayHandle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronously copies the whole host buffer (SizeInBytes bytes) to device memory using cuMemcpyHtoDAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream passed to cuMemcpyHtoDAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies SizeInBytes bytes from device memory into this host buffer using cuMemcpyDtoHAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream passed to cuMemcpyDtoHAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies the whole host buffer (SizeInBytes bytes) to a device variable using cuMemcpyHtoDAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream passed to cuMemcpyHtoDAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<ushort4> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(destination, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies SizeInBytes bytes from a device variable into this host buffer using cuMemcpyDtoHAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream passed to cuMemcpyDtoHAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<ushort4> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, source, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronously copies this host buffer to pitched device memory using cuMemcpy2DAsync_v2 on the given stream.<para/>
        /// Width * TypeSize bytes are copied per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, used as dstPitch</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies this host buffer to a pitched device variable.
        /// Forwards the variable's DevicePointer and Pitch to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream passed on to the forwarded call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<ushort4> deviceVar, CUstream stream)
        {
            CUdeviceptr devicePointer = deviceVar.DevicePointer;
            SizeT devicePitch = deviceVar.Pitch;
            AsyncCopyToDevice(devicePointer, devicePitch, stream);
        }

        /// <summary>
        /// Asynchronously copies pitched device memory into this host buffer using cuMemcpy2DAsync_v2 on the given stream.<para/>
        /// Width * TypeSize bytes are copied per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, used as srcPitch</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies a pitched device variable into this host buffer.
        /// Forwards the variable's DevicePointer and Pitch to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream passed on to the forwarded call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<ushort4> deviceVar, CUstream stream)
        {
            CUdeviceptr devicePointer = deviceVar.DevicePointer;
            SizeT devicePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(devicePointer, devicePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer obtained from cuMemHostGetDevicePointer_v2</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags)
        /// </summary>
        /// <returns>The flags reported by cuMemHostGetFlags for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log under the name of the function that was actually called
            // (the label used to read "cuMemHostGetDevicePointer").
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_ushort4 bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        IEnumerator<ushort4> IEnumerable<ushort4>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ushort4(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_ushort4 bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ushort4(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_ushort4.<para/>
    /// Elements are visited row by row: x runs over one row, then y advances to the next row.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_ushort4 : IEnumerator<ushort4>
    {
        private CudaPageLockedHostMemory2D_ushort4 _memory = null;
        // Position of the current element; only meaningful while _started is true.
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // False until the first MoveNext() after construction or Reset(). A flag is used instead of
        // an "x == -1" sentinel so that no negative value has to be stored in a SizeT.
        private bool _started = false;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        /// <exception cref="ArgumentNullException"><paramref name="memory"/> is null</exception>
        public CudaPageLockedHostMemory2DEnumerator_ushort4(CudaPageLockedHostMemory2D_ushort4 memory)
        {
            if (memory == null)
            {
                throw new ArgumentNullException("memory");
            }
            _memory = memory;
        }

        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _started = false;
            _currentX = 0;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position.
        /// </summary>
        /// <exception cref="InvalidOperationException">MoveNext() has not been called yet or the enumeration has finished</exception>
        public ushort4 Current
        {
            get { return ReadCurrent(); }
        }

        /// <summary>
        /// Element at the current position (boxed).
        /// </summary>
        /// <exception cref="InvalidOperationException">MoveNext() has not been called yet or the enumeration has finished</exception>
        object IEnumerator.Current
        {
            get { return ReadCurrent(); }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator is positioned on an element; false if there are no more elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A buffer without columns or without rows has no element to visit. Without this
            // check a zero-width buffer would report elements that lie outside of every row.
            if (width <= 0 || height <= 0)
            {
                return false;
            }

            if (!_started)
            {
                // First call: the position (0, 0) set by the constructor / Reset() becomes current.
                _started = true;
            }
            else if ((long)_currentY < height)
            {
                _currentX += 1;
                if ((long)_currentX >= width)
                {
                    // End of row reached: continue with the first element of the next row.
                    _currentX = 0;
                    _currentY += 1;
                }
            }

            return (long)_currentY < height;
        }

        // Reads the element at the current position, refusing positions before the first or after the last element.
        private ushort4 ReadCurrent()
        {
            if (!_started || (long)_currentY >= (long)_memory.Height)
            {
                throw new InvalidOperationException("Enumeration has either not started or has already finished.");
            }
            return _memory[_currentX, _currentY];
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: int
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_int: IDisposable, IEnumerable<int>
	{
		// Address of the pinned host allocation, filled in by cuMemHostAlloc in the constructor
		IntPtr _intPtr;
		// Same address as _intPtr, typed as int* for element access through the indexer
		int* _ptr;
        // Total size of the allocation in bytes (_pitchInBytes * _height)
        SizeT _sizeInBytes = 0;
        // Number of elements per row
        SizeT _width = 0;
        // Distance in bytes between the starts of two consecutive rows
        SizeT _pitchInBytes = 0;
        // Number of rows
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(int)) in bytes
        SizeT _typeSize = 0;
        // Result code of the most recent driver API call made by this instance
        CUResult res;
        // Set by Dispose() after cuMemFreeHost; checked by methods such as SynchronCopyToArray2D before touching the buffer
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int and allocates pitchInBytes * height bytes of memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(int)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return CUResult.Success</exception>
        public CudaPageLockedHostMemory2D_int(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(int));
            _sizeInBytes = _pitchInBytes * _height;

            if (_typeSize * width > _pitchInBytes)
            {
                // Nothing has been allocated yet. Without this the finalizer would later
                // report a misleading "not-disposed" warning for an object that never existed.
                GC.SuppressFinalize(this);
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(int)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (int*) _intPtr;
            if (res != CUResult.Success)
            {
                // The allocation failed, so there is nothing the finalizer could warn about.
                GC.SuppressFinalize(this);
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int with an explicit pitch and allocates the memory on host.
        /// Forwards to the four-argument constructor with an allocation flag value of 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_int(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int and allocates the memory on host.<para/>
        /// Pitch is assumed to be width * sizeof(int). Forwards to the four-argument constructor with an allocation flag value of 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_int(SizeT width, SizeT height)
            : this(
                width,
                width * (SizeT)Marshal.SizeOf(typeof(int)),
                height,
                (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int and allocates the memory on host with the given allocation flags.<para/>
        /// Pitch is assumed to be width * sizeof(int). Forwards to the four-argument constructor.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_int(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(
                width,
                width * (SizeT)Marshal.SizeOf(typeof(int)),
                height,
                allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Calls Dispose(false).
        /// </summary>
        ~CudaPageLockedHostMemory2D_int()
        {
            // false: we are called by the finalizer, not by an explicit Dispose()
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory (via Dispose(true)) and removes this instance from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            // Cleanup has been done explicitly; the finalizer no longer needs to run.
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable.<para/>
        /// With fDisposing = true the host memory is freed with cuMemFreeHost and the instance is marked as disposed.
        /// With fDisposing = false (finalizer) nothing is freed; only a warning is written to the debug output.
        /// Calls on an already disposed instance do nothing.
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (!fDisposing)
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
                return;
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
            disposed = true;
        }
        #endregion

        #region Properties
        /// <summary>
        /// Address of the pinned host memory as returned by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Reads or writes a single element. The row start is located at
        /// <c>y * Pitch</c> bytes from the beginning of the buffer, the element is then picked
        /// by <paramref name="x"/> within that row.<para/>
        /// No range check and no disposed check is performed on this path.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element stored at position (x, y)</returns>
        public int this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are addressed in bytes because the pitch may include padding.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                int* row = (int*)rowStart;
                return row[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                int* row = (int*)rowStart;
                row[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer into a CUDA array (cuMemcpy2D_v2).
        /// Copies <c>Width * TypeSize</c> bytes per row for <c>Height</c> rows, reading the host
        /// rows with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // source: this host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // destination: the CUDA array
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer into a CUDA array.
        /// Forwards <c>array.CUArray</c> to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous 2D copy from a CUDA array into this pinned host buffer (cuMemcpy2D_v2).
        /// Copies <c>Width * TypeSize</c> bytes per row for <c>Height</c> rows, writing the host
        /// rows with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // source: the CUDA array
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                // destination: this host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a CUDA array into this pinned host buffer.
        /// Forwards <c>array.CUArray</c> to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy of the whole buffer (<c>SizeInBytes</c> bytes) from host
        /// to the given device address using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of the whole buffer (<c>SizeInBytes</c> bytes) from host
        /// to the device pointer of the given device variable using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<int> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(destination, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of <c>SizeInBytes</c> bytes from the given device address
        /// into this pinned host buffer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of <c>SizeInBytes</c> bytes from the device pointer of the
        /// given device variable into this pinned host buffer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<int> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer to pitched device memory (cuMemcpy2D_v2).
        /// Copies <c>Width * TypeSize</c> bytes per row for <c>Height</c> rows; host rows use this
        /// buffer's pitch, device rows use <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed to the driver as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // source: this host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // destination: pitched device memory
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer to a pitched device variable.
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<int> deviceVar)
        {
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this pinned host buffer (cuMemcpy2D_v2).
        /// Copies <c>Width * TypeSize</c> bytes per row for <c>Height</c> rows; device rows use
        /// <paramref name="pitchDevice"/>, host rows use this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed to the driver as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // source: pitched device memory
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                // destination: this host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a pitched device variable into this pinned host buffer.
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<int> deviceVar)
        {
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer into a CUDA array, issued on
        /// <paramref name="stream"/> with cuMemcpy2DAsync_v2. Copies <c>Width * TypeSize</c> bytes
        /// per row for <c>Height</c> rows, reading host rows with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // source: this host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // destination: the CUDA array
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer into a CUDA array.
        /// Forwards <c>array.CUArray</c> and the stream to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from a CUDA array into this pinned host buffer, issued on
        /// <paramref name="stream"/> with cuMemcpy2DAsync_v2. Copies <c>Width * TypeSize</c> bytes
        /// per row for <c>Height</c> rows, writing host rows with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // source: the CUDA array
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                // destination: this host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a CUDA array into this pinned host buffer.
        /// Forwards <c>array.CUArray</c> and the stream to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous linear copy of the whole buffer (<c>SizeInBytes</c> bytes) from host to the
        /// given device address, issued on <paramref name="stream"/> with cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of <c>SizeInBytes</c> bytes from the given device address into
        /// this pinned host buffer, issued on <paramref name="stream"/> with cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of the whole buffer (<c>SizeInBytes</c> bytes) from host to the
        /// device pointer of the given device variable, issued on <paramref name="stream"/> with
        /// cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<int> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(destination, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of <c>SizeInBytes</c> bytes from the device pointer of the given
        /// device variable into this pinned host buffer, issued on <paramref name="stream"/> with
        /// cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<int> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, source, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to pitched device memory, issued on
        /// <paramref name="stream"/> with cuMemcpy2DAsync_v2. Copies <c>Width * TypeSize</c> bytes
        /// per row for <c>Height</c> rows; host rows use this buffer's pitch, device rows use
        /// <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed to the driver as dstPitch</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // source: this host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // destination: pitched device memory
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to a pitched device variable.
        /// Forwards the variable's device pointer and pitch, plus the stream, to the
        /// (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<int> deviceVar, CUstream stream)
        {
            this.AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this pinned host buffer, issued on
        /// <paramref name="stream"/> with cuMemcpy2DAsync_v2. Copies <c>Width * TypeSize</c> bytes
        /// per row for <c>Height</c> rows; device rows use <paramref name="pitchDevice"/>, host rows
        /// use this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed to the driver as srcPitch</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // source: pitched device memory
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                // destination: this host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable into this pinned host buffer.
        /// Forwards the variable's device pointer and pitch, plus the stream, to the
        /// (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<int> deviceVar, CUstream stream)
        {
            this.AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer that corresponds to this pinned host memory by calling
        /// cuMemHostGetDevicePointer_v2 (flags argument 0). Only valid if context is created with
        /// flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer,
        /// as reported by cuMemHostGetFlags.
        /// </summary>
        /// <returns>The allocation flags returned by the driver for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log under the name of the API that was actually called; the label used to be
            // "cuMemHostGetDevicePointer", which made debug traces point at the wrong driver call.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator (CudaPageLockedHostMemory2DEnumerator_int) over this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator<int> IEnumerable<int>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_int(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator (CudaPageLockedHostMemory2DEnumerator_int) over this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_int(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_int. Walks the elements row by row:
    /// the X position advances first and wraps to the next row once it reaches Width.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_int : IEnumerator<int>
    {
        // Memory block being enumerated.
        private CudaPageLockedHostMemory2D_int _memory = null;
        // Current column. Starts one step "before" column 0 so the first MoveNext() moves onto it.
        // NOTE(review): this relies on SizeT arithmetic turning -1 + 1 into 0 - confirm against SizeT.
        private SizeT _currentX = -1;
        // Current row.
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over the given memory block.
        /// </summary>
        /// <param name="memory">Memory block whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_int(CudaPageLockedHostMemory2D_int memory)
        {
            this._memory = memory;
        }

        // The enumerator owns no resources, so there is nothing to release.
        void IDisposable.Dispose()
        {
        }

        /// <summary>
        /// Puts the enumerator back to its initial position (before the first element of row 0).
        /// </summary>
        public void Reset()
        {
            _currentY = 0;
            _currentX = -1;
        }

        /// <summary>
        /// Element at the current position, read through the indexer of the memory block.
        /// </summary>
        public int Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Element at the current position, boxed as object.
        /// </summary>
        object IEnumerator.Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Advances by one column, wrapping to column 0 of the next row when the end of a row is reached.
        /// </summary>
        /// <returns>true while the current row index is still below Height, false otherwise</returns>
        public bool MoveNext()
        {
            _currentX += 1;

            bool pastEndOfRow = (long)_currentX >= (long)_memory.Width;
            if (pastEndOfRow)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy.<para/>
	/// Type: int1
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_int1: IDisposable, IEnumerable<int1>
	{
		IntPtr _intPtr;
		int1* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int1 and allocates <c>pitchInBytes * height</c>
        /// bytes of host memory with cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException"><paramref name="pitchInBytes"/> is smaller than width * sizeof(int1)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return CUResult.Success</exception>
        public CudaPageLockedHostMemory2D_int1(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            this._intPtr = IntPtr.Zero;
            this._typeSize = (SizeT)Marshal.SizeOf(typeof(int1));
            this._width = width;
            this._height = height;
            this._pitchInBytes = pitchInBytes;
            this._sizeInBytes = this._pitchInBytes * this._height;

            // A row of 'width' elements has to fit into one pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(int1)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));

            // The typed pointer is refreshed before the result check, whatever the outcome of the allocation.
            _ptr = (int1*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int1 with an explicit pitch and without
        /// allocation flags (0 is forwarded to the four-argument constructor).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_int1(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {
            // Intentionally empty: everything happens in the chained constructor.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int1 without allocation flags.<para/>
        /// The pitch is taken to be width * sizeof(int1); both values are forwarded to the
        /// four-argument constructor together with flags 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_int1(SizeT width, SizeT height)
            : this(
                width,
                width * (SizeT)Marshal.SizeOf(typeof(int1)),
                height,
                0)
        {
            // Intentionally empty: everything happens in the chained constructor.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int1 with caller supplied allocation flags.<para/>
        /// The pitch is taken to be width * sizeof(int1); construction is forwarded to the
        /// four-argument constructor together with <paramref name="allocFlags"/>.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc by the four-argument constructor</param>
        public CudaPageLockedHostMemory2D_int1(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(
                width,
                width * (SizeT)Marshal.SizeOf(typeof(int1)),
                height,
                allocFlags)
        {
            // Intentionally empty: everything happens in the chained constructor.
        }

        /// <summary>
        /// Finalizer. Invokes <c>Dispose(false)</c>, which only emits the not-disposed warning
        /// if the object was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_int1()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases this object: runs the disposing path of <c>Dispose(bool)</c> and then
        /// tells the garbage collector that the finalizer no longer needs to run.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core dispose logic shared by <see cref="Dispose()"/> and the finalizer.<para/>
        /// When called with <c>true</c> on an object that has not been disposed yet, the pinned host
        /// memory is released with cuMemFreeHost (the result is only logged, never thrown) and the
        /// object is marked as disposed.<para/>
        /// When called with <c>false</c> on an object that has not been disposed yet, nothing is freed;
        /// only a "not-disposed" warning is written to the debug output.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Already released: neither path has anything left to do.
            if (disposed)
            {
                return;
            }

            if (!fDisposing)
            {
                // Finalizer path: report the missing Dispose() call, but do not touch the memory.
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
                return;
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
            disposed = true;
        }
        #endregion

        #region Properties
        /// <summary>
        /// Address of the pinned host memory block, as filled in by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Row pitch in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of a single int1 element in bytes, as reported by Marshal.SizeOf.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element. The row start is located at
        /// <c>y * Pitch</c> bytes from the beginning of the buffer, the element is then picked
        /// by <paramref name="x"/> within that row.<para/>
        /// No range check and no disposed check is performed on this path.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element stored at position (x, y)</returns>
        public int1 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are addressed in bytes because the pitch may include padding.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                int1* row = (int1*)rowStart;
                return row[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                int1* row = (int1*)rowStart;
                row[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer into a CUDA array (cuMemcpy2D_v2).
        /// Copies <c>Width * TypeSize</c> bytes per row for <c>Height</c> rows, reading the host
        /// rows with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // source: this host buffer
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                // destination: the CUDA array
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer into a CUDA array.
        /// Forwards <c>array.CUArray</c> to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous 2D copy from a CUDA array into this pinned host buffer (cuMemcpy2D_v2).
        /// Copies <c>Width * TypeSize</c> bytes per row for <c>Height</c> rows, writing the host
        /// rows with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // source: the CUDA array
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                // destination: this host buffer
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                // extent of the copied region
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a CUDA array into this pinned host buffer.
        /// Forwards <c>array.CUArray</c> to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy of the whole buffer (<c>SizeInBytes</c> bytes) from host
        /// to the given device address using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of the whole buffer (<c>SizeInBytes</c> bytes) from host
        /// to the device pointer of the given device variable using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<int1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(destination, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy from device memory into this host memory.
        /// SizeInBytes bytes are transferred using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy from a device variable into this host memory.
        /// SizeInBytes bytes are read from the variable's device pointer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<int1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy from this host memory to pitched device memory using cuMemcpy2D_v2.
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is this host buffer, destination is the pitched device allocation.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this host memory to a pitched device variable.<para/>
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<int1> deviceVar)
        {
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this host memory using cuMemcpy2D_v2.
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is the pitched device allocation, destination is this host buffer.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a pitched device variable into this host memory.<para/>
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<int1> deviceVar)
        {
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous 2D copy from this host memory to a CUDA array, issued on the given stream
        /// with cuMemcpy2DAsync_v2. Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is this host buffer, destination is the CUDA array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from this host memory to a 2D CUDA array.<para/>
        /// Forwards the CUarray handle of <paramref name="array"/> to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination 2D array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from a CUDA array into this host memory, issued on the given stream
        /// with cuMemcpy2DAsync_v2. Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is the CUDA array, destination is this host buffer.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a 2D CUDA array into this host memory.<para/>
        /// Forwards the CUarray handle of <paramref name="array"/> to the CUarray overload.
        /// </summary>
        /// <param name="array">Source 2D array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous linear copy from this host memory to device memory, issued on the given stream.
        /// SizeInBytes bytes are transferred using cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy from device memory into this host memory, issued on the given stream.
        /// SizeInBytes bytes are transferred using cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy from this host memory to a device variable, issued on the given stream.
        /// SizeInBytes bytes are transferred to the variable's device pointer using cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<int1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy from a device variable into this host memory, issued on the given stream.
        /// SizeInBytes bytes are read from the variable's device pointer using cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<int1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy from this host memory to pitched device memory, issued on the given stream
        /// with cuMemcpy2DAsync_v2. Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is this host buffer, destination is the pitched device allocation.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from this host memory to a pitched device variable.<para/>
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<int1> deviceVar, CUstream stream)
        {
            this.AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this host memory, issued on the given stream
        /// with cuMemcpy2DAsync_v2. Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is the pitched device allocation, destination is this host buffer.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable into this host memory.<para/>
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<int1> deviceVar, CUstream stream)
        {
            this.AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer that corresponds to this pinned host memory via cuMemHostGetDevicePointer_v2.
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr mappedPtr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPtr, _intPtr, 0);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPtr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer,
        /// as reported by cuMemHostGetFlags.
        /// </summary>
        /// <returns>The allocation flags of this host memory</returns>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log the function that was actually called; the previous message named
            // cuMemHostGetDevicePointer, which made debug traces misleading.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this host memory.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_int1 bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        IEnumerator<int1> IEnumerable<int1>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_int1(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over the elements of this host memory.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_int1 bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_int1(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_int1.<para/>
    /// Visits the elements row by row: x runs from 0 to Width - 1, then y advances by one,
    /// until y reaches Height.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_int1 : IEnumerator<int1>
    {
        private CudaPageLockedHostMemory2D_int1 _memory = null;
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // Marks whether MoveNext has already positioned the enumerator on element (0, 0).
        // An explicit flag is used instead of storing -1 in a SizeT: how SizeT represents and
        // increments a negative value is not visible here, and a wrapped value would make the
        // first MoveNext skip elements.
        private bool _started = false;

        /// <summary>
        /// Creates an enumerator over the given memory. The enumerator is positioned before the first element.
        /// </summary>
        /// <param name="memory">Memory whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_int1(CudaPageLockedHostMemory2D_int1 memory)
        {
            _memory = memory;
        }

        void IDisposable.Dispose() { }

        /// <summary>
        /// Sets the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _started = false;
            _currentX = 0;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position. Only meaningful after MoveNext returned true.
        /// </summary>
        public int1 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed). Only meaningful after MoveNext returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator is positioned on an element; false if there are no (more) elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // Nothing to enumerate: never report a position that does not exist.
            if (width <= 0 || height <= 0)
                return false;

            // First call: the position is already (0, 0).
            if (!_started)
            {
                _started = true;
                return true;
            }

            // Already past the last row: stay there.
            if ((long)_currentY >= height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                // End of row reached, continue at the start of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: int2
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_int2: IDisposable, IEnumerable<int2>
	{
		// Host pointer filled in by cuMemHostAlloc in the constructor and released by cuMemFreeHost in Dispose.
		IntPtr _intPtr;
		// The same address as _intPtr, typed as int2* for element access through the indexer.
		int2* _ptr;
        // Total allocation size: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Width in elements.
        SizeT _width = 0;
        // Distance between the starts of two consecutive rows, in bytes.
        SizeT _pitchInBytes = 0;
        // Height in elements (number of rows).
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(int2)), set in the constructor.
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made by this instance.
        CUResult res;
        // Set to true once Dispose has released the host memory.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int2 and allocates pitchInBytes * height bytes
        /// of host memory using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(int2)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return CUResult.Success</exception>
        public CudaPageLockedHostMemory2D_int2(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(int2));
            _sizeInBytes = _pitchInBytes * _height;

            // One row of 'width' elements must fit into a single pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(int2)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (int2*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int2 and allocates the memory on host.
        /// Delegates to the four-argument constructor with allocation flags 0 (no flags).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_int2(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int2 and allocates the memory on host.<para/>
        /// Pitch is set to width * sizeof(int2) and the allocation flags are 0 (no flags).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_int2(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(int2)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int2 and allocates the memory on host
        /// using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is set to width * sizeof(int2).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_int2(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(int2)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Calls Dispose(false), which only writes a debug warning if the instance
        /// was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_int2()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory and suppresses finalization of this instance.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable. When called with <paramref name="fDisposing"/> = true, frees the host memory
        /// with cuMemFreeHost and marks the instance as disposed. When called with false (from the
        /// finalizer) on an instance that was not disposed, only a debug warning is written.
        /// Does nothing if the instance is already disposed.
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(string.Format("ManagedCUDA not-disposed warning: {0}", GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Host pointer of the pinned memory as obtained from cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width of the memory in elements.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Height of the memory in elements (number of rows).
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Pitch in bytes, i.e. the byte distance between the starts of two consecutive rows.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch in bytes * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one int2 element in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element. The element address is computed from the row start
        /// (base pointer + pitch in bytes * y) and the element index x within that row.<para/>
        /// No bounds check and no disposed check is performed here.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at position (x, y)</returns>
        public int2 this[SizeT x, SizeT y]
        {
            get
            {
				// Rows are _pitchInBytes apart, so the row offset is applied on a byte pointer.
				int2* line = (int2*)(((byte*)_ptr) + _pitchInBytes * y);
				return line[x];
            }
            set
            {
				// Same row addressing as in the getter.
				int2* line = (int2*)(((byte*)_ptr) + _pitchInBytes * y);
				line[x] = value;                
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous 2D copy from this host memory to a CUDA array using cuMemcpy2D_v2.
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is this host buffer, destination is the CUDA array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this host memory to a 2D CUDA array.<para/>
        /// Forwards the CUarray handle of <paramref name="array"/> to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination 2D array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous 2D copy from a CUDA array into this host memory using cuMemcpy2D_v2.
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is the CUDA array, destination is this host buffer.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a 2D CUDA array into this host memory.<para/>
        /// Forwards the CUarray handle of <paramref name="array"/> to the CUarray overload.
        /// </summary>
        /// <param name="array">Source 2D array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy from this host memory to device memory.
        /// SizeInBytes bytes are transferred using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy from this host memory to a device variable.
        /// SizeInBytes bytes are transferred to the variable's device pointer using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<int2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy from device memory into this host memory.
        /// SizeInBytes bytes are transferred using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy from a device variable into this host memory.
        /// SizeInBytes bytes are read from the variable's device pointer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<int2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy from this host memory to pitched device memory using cuMemcpy2D_v2.
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is this host buffer, destination is the pitched device allocation.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this host memory to a pitched device variable.<para/>
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<int2> deviceVar)
        {
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this host memory using cuMemcpy2D_v2.
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is the pitched device allocation, destination is this host buffer.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a pitched device variable into this host memory.<para/>
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<int2> deviceVar)
        {
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous 2D copy from this host memory to a CUDA array, issued on the given stream
        /// with cuMemcpy2DAsync_v2. Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return CUResult.Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is this host buffer, destination is the CUDA array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies this pinned host buffer into a 2D CUDA array.<para/>
        /// Forwards the wrapper's underlying array handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        /// <param name="stream">Stream handed to the copy</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray targetHandle = array.CUArray;
            AsyncCopyToArray2D(targetHandle, stream);
        }

        /// <summary>
        /// Asynchronously copies a 2D CUDA array into this pinned host buffer using cuMemcpy2DAsync.<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows are written with the host pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle</param>
        /// <param name="stream">Stream handed to cuMemcpy2DAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies a 2D CUDA array into this pinned host buffer.<para/>
        /// Forwards the wrapper's underlying array handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream handed to the copy</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray sourceHandle = array.CUArray;
            AsyncCopyFromArray2D(sourceHandle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronously copies SizeInBytes bytes from this pinned host buffer to device memory
        /// using cuMemcpyHtoDAsync.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream handed to cuMemcpyHtoDAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr,
                _intPtr,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies SizeInBytes bytes from device memory into this pinned host buffer
        /// using cuMemcpyDtoHAsync.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream handed to cuMemcpyDtoHAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr,
                devicePtr,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies SizeInBytes bytes from this pinned host buffer to the memory of a
        /// device variable using cuMemcpyHtoDAsync.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream handed to cuMemcpyHtoDAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<int2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr.DevicePointer,
                _intPtr,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies SizeInBytes bytes from the memory of a device variable into this
        /// pinned host buffer using cuMemcpyDtoHAsync.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream handed to cuMemcpyDtoHAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<int2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr,
                devicePtr.DevicePointer,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronously copies this pinned host buffer to pitched device memory using cuMemcpy2DAsync.<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows use the host pitch and
        /// device rows use <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch used for the destination rows</param>
        /// <param name="stream">Stream handed to cuMemcpy2DAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies this pinned host buffer to a pitched device variable.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream handed to the copy</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<int2> deviceVar, CUstream stream)
        {
            CUdeviceptr targetPointer = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(targetPointer, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronously copies pitched device memory into this pinned host buffer using cuMemcpy2DAsync.<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; device rows use
        /// <paramref name="pitchDevice"/> and host rows use the host pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch used for the source rows</param>
        /// <param name="stream">Stream handed to cuMemcpy2DAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream handed to the copy</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<int2> deviceVar, CUstream stream)
        {
            CUdeviceptr sourcePointer = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(sourcePointer, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries, via cuMemHostGetDevicePointer, the device pointer that corresponds to this pinned host
        /// allocation. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer reported by the driver</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer,
        /// as reported by cuMemHostGetFlags.
        /// </summary>
        /// <returns>Allocation flags reported by the driver</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the API that was actually called (the label previously named cuMemHostGetDevicePointer).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator<int2> IEnumerable<int2>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_int2(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over the elements of this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_int2(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_int2.<para/>
    /// Walks the buffer row by row: x runs from 0 up to Width - 1, then y advances by one,
    /// until y reaches Height.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_int2 : IEnumerator<int2>
    {
        private CudaPageLockedHostMemory2D_int2 _memory = null;
        // -1 marks "before the first element"; the first MoveNext() increments it.
        // NOTE(review): relies on SizeT representing -1 so that -1 + 1 gives 0 - confirm against SizeT.
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_int2(CudaPageLockedHostMemory2D_int2 memory)
        {
            _memory = memory;
        }

        // The enumerator holds no resources of its own, so there is nothing to release.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current (x, y) position, read through the buffer's indexer.
        /// </summary>
        public int2 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Boxed element at the current (x, y) position.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true while the new position is inside the buffer; false once all rows are consumed
        /// or when the buffer has no columns</returns>
        public bool MoveNext()
        {
            // A buffer without columns has no elements. Without this guard the wrap-around below would
            // report positions (0, y) for y >= 1 as valid although no element exists in any row.
            if ((long)_memory.Width <= 0)
                return false;

            _currentX += 1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: continue at the start of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: int3
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_int3: IDisposable, IEnumerable<int3>
	{
		// Host address filled in by cuMemHostAlloc in the constructor and released with cuMemFreeHost in Dispose.
		IntPtr _intPtr;
		// Same address as _intPtr, typed as int3* for the element indexer.
		int3* _ptr;
        // Allocation size in bytes; computed as pitch * height in the constructor.
        SizeT _sizeInBytes = 0;
        // Row length in elements.
        SizeT _width = 0;
        // Distance between the starts of two consecutive rows, in bytes.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(int3)), set in the constructor.
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made by this object.
        CUResult res;
        // Set once Dispose has released the host allocation; guards the public methods.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Allocates pitchInBytes * height bytes of page locked host memory with cuMemHostAlloc
        /// and wraps it as a 2D buffer of int3 elements.
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="pitchInBytes">Row stride in bytes; must be at least width * sizeof(int3)</param>
        /// <param name="height">Number of rows</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">The pitch is smaller than one row of elements</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return Success</exception>
        public CudaPageLockedHostMemory2D_int3(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            this._intPtr = new IntPtr();
            this._width = width;
            this._pitchInBytes = pitchInBytes;
            this._height = height;
            this._typeSize = (SizeT)Marshal.SizeOf(typeof(int3));
            this._sizeInBytes = this._pitchInBytes * this._height;

            // Reject a pitch that cannot hold a full row before anything is allocated.
            bool pitchTooSmall = _typeSize * width > _pitchInBytes;
            if (pitchTooSmall)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(int3)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (int3*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Allocates a pitched 2D buffer of int3 elements in page locked host memory,
        /// passing 0 as allocation flags to the four-argument constructor.
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="pitchInBytes">Row stride in bytes, including alignment</param>
        /// <param name="height">Number of rows</param>
        public CudaPageLockedHostMemory2D_int3(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {
        }

        /// <summary>
        /// Allocates a densely packed 2D buffer of int3 elements in page locked host memory.<para/>
        /// The pitch is width * sizeof(int3) and 0 is passed as allocation flags.
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="height">Number of rows</param>
        public CudaPageLockedHostMemory2D_int3(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(int3)), height, 0)
        {
        }

        /// <summary>
        /// Allocates a densely packed 2D buffer of int3 elements in page locked host memory
        /// with the given allocation flags.<para/>
        /// The pitch is width * sizeof(int3).
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="height">Number of rows</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_int3(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(int3)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>, which only
        /// writes a debug warning if the object was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_int3()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host allocation and suppresses finalization of this object.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core of the dispose pattern. When called from <see cref="Dispose()"/> the host allocation is
        /// released with cuMemFreeHost; when called from the finalizer only a debug warning is written.
        /// Does nothing if the object is already disposed.
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                // Finalizer path: the memory is not freed here, the missing Dispose() call is only reported.
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Address of the pinned host allocation.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Row length in elements.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows, in elements.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Row stride in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one int3 element in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element. The row start is located at pitch * y bytes from the
        /// beginning of the buffer; x then indexes elements within that row. No bounds checks are performed.
        /// </summary>
        /// <param name="x">Column index in elements</param>
        /// <param name="y">Row index in elements</param>
        /// <returns>Element stored at (x, y)</returns>
        public int3 this[SizeT x, SizeT y]
        {
            get
            {
                byte* bufferStart = (byte*)_ptr;
                int3* row = (int3*)(bufferStart + _pitchInBytes * y);
                return row[x];
            }
            set
            {
                byte* bufferStart = (byte*)_ptr;
                int3* row = (int3*)(bufferStart + _pitchInBytes * y);
                row[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronously copies this pinned host buffer into a 2D CUDA array using cuMemcpy2D.<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows are read with the host pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this pinned host buffer into a 2D CUDA array.<para/>
        /// Forwards the wrapper's underlying array handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray targetHandle = array.CUArray;
            SynchronCopyToArray2D(targetHandle);
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this pinned host buffer using cuMemcpy2D.<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows are written with the host pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this pinned host buffer.<para/>
        /// Forwards the wrapper's underlying array handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray sourceHandle = array.CUArray;
            SynchronCopyFromArray2D(sourceHandle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronously copies SizeInBytes bytes from this pinned host buffer to device memory
        /// using cuMemcpyHtoD.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(
                devicePtr,
                _intPtr,
                SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from this pinned host buffer to the memory of a
        /// device variable using cuMemcpyHtoD.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<int3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(
                devicePtr.DevicePointer,
                _intPtr,
                SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from device memory into this pinned host buffer
        /// using cuMemcpyDtoH.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(
                _intPtr,
                devicePtr,
                SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from the memory of a device variable into this
        /// pinned host buffer using cuMemcpyDtoH.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<int3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(
                _intPtr,
                devicePtr.DevicePointer,
                SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this pinned host buffer to pitched device memory using cuMemcpy2D.<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows use the host pitch and
        /// device rows use <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch used for the destination rows</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this pinned host buffer to a pitched device variable.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<int3> deviceVar)
        {
            CUdeviceptr targetPointer = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(targetPointer, targetPitch);
        }

        /// <summary>
        /// Synchronously copies pitched device memory into this pinned host buffer using cuMemcpy2D.<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; device rows use
        /// <paramref name="pitchDevice"/> and host rows use the host pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch used for the source rows</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<int3> deviceVar)
        {
            CUdeviceptr sourcePointer = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(sourcePointer, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronously copies this pinned host buffer into a 2D CUDA array using cuMemcpy2DAsync.<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows are read with the host pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle</param>
        /// <param name="stream">Stream handed to cuMemcpy2DAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies this pinned host buffer into a 2D CUDA array.<para/>
        /// Forwards the wrapper's underlying array handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        /// <param name="stream">Stream handed to the copy</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray targetHandle = array.CUArray;
            AsyncCopyToArray2D(targetHandle, stream);
        }

        /// <summary>
        /// Asynchronously copies a 2D CUDA array into this pinned host buffer using cuMemcpy2DAsync.<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows are written with the host pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle</param>
        /// <param name="stream">Stream handed to cuMemcpy2DAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies a 2D CUDA array into this pinned host buffer.<para/>
        /// Forwards the wrapper's underlying array handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream handed to the copy</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray sourceHandle = array.CUArray;
            AsyncCopyFromArray2D(sourceHandle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronously copies SizeInBytes bytes from this pinned host buffer to device memory
        /// using cuMemcpyHtoDAsync.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream handed to cuMemcpyHtoDAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr,
                _intPtr,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies SizeInBytes bytes from device memory into this pinned host buffer
        /// using cuMemcpyDtoHAsync.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream handed to cuMemcpyDtoHAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr,
                devicePtr,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies SizeInBytes bytes from this pinned host buffer to the memory of a
        /// device variable using cuMemcpyHtoDAsync.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream handed to cuMemcpyHtoDAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<int3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr.DevicePointer,
                _intPtr,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies SizeInBytes bytes from the memory of a device variable into this
        /// pinned host buffer using cuMemcpyDtoHAsync.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream handed to cuMemcpyDtoHAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<int3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr,
                devicePtr.DevicePointer,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronously copies this pinned host buffer to pitched device memory using cuMemcpy2DAsync.<para/>
        /// Each of the Height rows transfers Width * TypeSize bytes; host rows use the host pitch and
        /// device rows use <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch used for the destination rows</param>
        /// <param name="stream">Stream handed to cuMemcpy2DAsync</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy host to pitched device memory. Forwards the device pointer and pitch of
        /// <paramref name="deviceVar"/> to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<int3> deviceVar, CUstream stream)
        {
            var targetPointer = deviceVar.DevicePointer;
            var targetPitch = deviceVar.Pitch;

            AsyncCopyToDevice(targetPointer, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronous copy pitched device memory to host. Copies Height rows of Width * TypeSize bytes each,
        /// reading rows with <paramref name="pitchDevice"/> and writing them with the host pitch (cuMemcpy2DAsync).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source in bytes</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: the pitched device allocation. Destination: this pinned host buffer.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy pitched device memory to host. Forwards the device pointer and pitch of
        /// <paramref name="deviceVar"/> to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<int3> deviceVar, CUstream stream)
        {
            var sourcePointer = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;

            AsyncCopyFromDevice(sourcePointer, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer obtained from cuMemHostGetDevicePointer for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res == CUResult.Success)
            {
                return mappedPointer;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns>Flags reported by cuMemHostGetFlags for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace under the name of the API that was actually called (previously mislabelled as cuMemHostGetDevicePointer).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_int3 bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator<int3> IEnumerable<int3>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_int3(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_int3 bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_int3(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_int3.<para/>
    /// Elements are visited row by row: x advances first and wraps to the next y once it reaches the width.
    /// Every element is read through the indexer of the wrapped memory object.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_int3 : IEnumerator<int3>
    {
        private CudaPageLockedHostMemory2D_int3 _memory = null;
        // Position starts one step before the first element, so the first MoveNext() lands on (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over the given memory object.
        /// </summary>
        /// <param name="memory">Memory object whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_int3(CudaPageLockedHostMemory2D_int3 memory)
        {
            _memory = memory;
        }

        // The enumerator owns no resources; the wrapped memory object is not disposed here.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position. Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        public int3 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed). Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points to an element; false if there are no (more) elements</returns>
        public bool MoveNext()
        {
            // A surface without columns has no elements at all. Without this guard the wrap-around
            // below would step through the rows and report non-existent elements for Height > 1.
            if ((long)_memory.Width <= 0)
                return false;

            // Once the end has been passed, stay there instead of advancing the position any further.
            if ((long)_currentY >= (long)_memory.Height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: continue with the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: int4<para/>
	/// The buffer consists of Height rows of Pitch bytes each (SizeInBytes = Pitch * Height); 2D copies transfer
	/// Width * TypeSize bytes per row.
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_int4: IDisposable, IEnumerable<int4>
	{
        // Host allocation as returned by cuMemHostAlloc, and the same address as a typed pointer.
        IntPtr _intPtr;
        int4* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made by this instance.
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        /// <exception cref="ArgumentException"><paramref name="pitchInBytes"/> is smaller than width * sizeof(int4)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_int4(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(int4));
            _sizeInBytes = _pitchInBytes * _height;

            // A row must be able to hold at least 'width' elements.
            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(int4)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            _ptr = (int4*) _intPtr;
            CheckResult("cuMemHostAlloc");
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_int4(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(int4).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_int4(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(int4)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_int4 and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(int4).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_int4(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(int4)), height, allocFlags)
        {

        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_int4()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                // The result is only traced, never thrown, so Dispose() itself does not raise CudaException.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            // On the finalizer path the memory is not freed here; only a debug warning is written.
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Private helpers
        /// <summary>
        /// Throws if this instance has already been disposed.
        /// </summary>
        private void ThrowIfDisposed()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
        }

        /// <summary>
        /// Traces the result of the last driver call under the given API name and throws if it was not successful.
        /// </summary>
        /// <param name="apiName">Name of the driver API written to the debug trace</param>
        private void CheckResult(string apiName)
        {
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, apiName, res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Builds a 2D copy descriptor whose source is this host buffer; the caller fills in the destination.
        /// </summary>
        private CUDAMemCpy2D CreateCopyFromHost()
        {
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;
            return cpyProps;
        }

        /// <summary>
        /// Builds a 2D copy descriptor whose destination is this host buffer; the caller fills in the source.
        /// </summary>
        private CUDAMemCpy2D CreateCopyToHost()
        {
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;
            return cpyProps;
        }

        /// <summary>
        /// Runs a synchronous 2D copy (cuMemcpy2D_v2) and checks its result.
        /// </summary>
        private void RunCopy2D(ref CUDAMemCpy2D cpyProps)
        {
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            CheckResult("cuMemcpy2D_v2");
        }

        /// <summary>
        /// Issues an asynchronous 2D copy (cuMemcpy2DAsync_v2) on the given stream and checks its result.
        /// </summary>
        private void RunCopy2DAsync(ref CUDAMemCpy2D cpyProps, CUstream stream)
        {
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            CheckResult("cuMemcpy2DAsync");
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element. The element address is computed as row start (y * Pitch bytes) plus x elements.<para/>
        /// No bounds checking is performed on <paramref name="x"/> and <paramref name="y"/>.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns></returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        public int4 this[SizeT x, SizeT y]
        {
            get
            {
                // After Dispose() the pointer refers to freed memory; refuse access instead of dereferencing it.
                ThrowIfDisposed();
                int4* line = (int4*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                ThrowIfDisposed();
                int4* line = (int4*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyFromHost();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            RunCopy2D(ref cpyProps);
        }

        /// <summary>
        /// Synchronous copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyToHost();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            RunCopy2D(ref cpyProps);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous copy host to device (SizeInBytes bytes, cuMemcpyHtoD)
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            CheckResult("cuMemcpyHtoD");
        }

        /// <summary>
        /// Synchronous copy host to device (SizeInBytes bytes, cuMemcpyHtoD)
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<int4> devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            CheckResult("cuMemcpyHtoD");
        }

        /// <summary>
        /// Synchronous copy device to host (SizeInBytes bytes, cuMemcpyDtoH)
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            CheckResult("cuMemcpyDtoH");
        }

        /// <summary>
        /// Synchronous copy device to host (SizeInBytes bytes, cuMemcpyDtoH)
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<int4> devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            CheckResult("cuMemcpyDtoH");
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice">Pitch of the destination in bytes</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyFromHost();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            RunCopy2D(ref cpyProps);
        }

        /// <summary>
        /// Synchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<int4> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous copy pitched device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice">Pitch of the source in bytes</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyToHost();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            RunCopy2D(ref cpyProps);
        }

        /// <summary>
        /// Synchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<int4> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyFromHost();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            RunCopy2DAsync(ref cpyProps, stream);
        }

        /// <summary>
        /// Asynchronous copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyToHost();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            RunCopy2DAsync(ref cpyProps, stream);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy host to device (SizeInBytes bytes, cuMemcpyHtoDAsync)
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            CheckResult("cuMemcpyHtoDAsync");
        }

        /// <summary>
        /// Asynchronous copy device to host (SizeInBytes bytes, cuMemcpyDtoHAsync)
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            CheckResult("cuMemcpyDtoHAsync");
        }

        /// <summary>
        /// Asynchronous copy host to device (SizeInBytes bytes, cuMemcpyHtoDAsync)
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<int4> devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            CheckResult("cuMemcpyHtoDAsync");
        }

        /// <summary>
        /// Asynchronous copy device to host (SizeInBytes bytes, cuMemcpyDtoHAsync)
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<int4> devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            CheckResult("cuMemcpyDtoHAsync");
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice">Pitch of the destination in bytes</param>
        /// <param name="stream"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyFromHost();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            RunCopy2DAsync(ref cpyProps, stream);
        }

        /// <summary>
        /// Asynchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<int4> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice">Pitch of the source in bytes</param>
        /// <param name="stream"></param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = CreateCopyToHost();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            RunCopy2DAsync(ref cpyProps, stream);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<int4> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            ThrowIfDisposed();
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            CheckResult("cuMemHostGetDevicePointer");
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns>Flags reported by cuMemHostGetFlags for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            ThrowIfDisposed();
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace under the name of the API that was actually called (previously mislabelled as cuMemHostGetDevicePointer).
            CheckResult("cuMemHostGetFlags");
            return flags;
        }
        #endregion

        #region IEnumerable
        IEnumerator<int4> IEnumerable<int4>.GetEnumerator()
        {
            ThrowIfDisposed();
            IEnumerator<int4> enumerator = new CudaPageLockedHostMemory2DEnumerator_int4(this);
            return enumerator;
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            ThrowIfDisposed();
            IEnumerator enumerator = new CudaPageLockedHostMemory2DEnumerator_int4(this);
            return enumerator;
        }

        #endregion
	}

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_int4.<para/>
    /// Elements are visited row by row: x advances first and wraps to the next y once it reaches the width.
    /// Every element is read through the indexer of the wrapped memory object.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_int4 : IEnumerator<int4>
    {
        private CudaPageLockedHostMemory2D_int4 _memory = null;
        // Position starts one step before the first element, so the first MoveNext() lands on (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over the given memory object.
        /// </summary>
        /// <param name="memory">Memory object whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_int4(CudaPageLockedHostMemory2D_int4 memory)
        {
            _memory = memory;
        }

        // The enumerator owns no resources; the wrapped memory object is not disposed here.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position. Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        public int4 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed). Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points to an element; false if there are no (more) elements</returns>
        public bool MoveNext()
        {
            // A surface without columns has no elements at all. Without this guard the wrap-around
            // below would step through the rows and report non-existent elements for Height > 1.
            if ((long)_memory.Width <= 0)
                return false;

            // Once the end has been passed, stay there instead of advancing the position any further.
            if ((long)_currentY >= (long)_memory.Height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: continue with the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: uint
	/// </summary>
	/// <remarks>
	/// The copy and query methods throw <see cref="ObjectDisposedException"/> once the object has been disposed
	/// and <see cref="CudaException"/> when the underlying driver call does not return <see cref="CUResult.Success"/>.
	/// </remarks>
	public unsafe class CudaPageLockedHostMemory2D_uint: IDisposable, IEnumerable<uint>
	{
		// Host pointer filled in by cuMemHostAlloc.
		IntPtr _intPtr;
		// Same address as _intPtr, typed for element access in the indexer.
		uint* _ptr;
        // Total allocation size: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        // Result of the most recent driver API call.
        CUResult res;
        // Set once cuMemFreeHost has been called through Dispose().
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException"><paramref name="pitchInBytes"/> is smaller than width * sizeof(uint)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_uint(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(uint));
            _sizeInBytes = _pitchInBytes * _height;

            // One row of 'width' elements has to fit into a single pitch.
            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(uint)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (uint*) _intPtr;
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uint(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(uint).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uint(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uint)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(uint).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_uint(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uint)), height, allocFlags)
        {

        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_uint()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable.<para/>
        /// Frees the host memory with cuMemFreeHost when called with <paramref name="fDisposing"/> == true.
        /// When called from the finalizer (false) on an object that was never disposed, the memory is not freed;
        /// only a debug warning is written.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes (pitch * height)
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element.<para/>
        /// The row start is computed as base pointer + pitch * y. No bounds check and no disposed check is performed.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at column <paramref name="x"/> of row <paramref name="y"/></returns>
        public uint this[SizeT x, SizeT y]
        {
            get
            {
                uint* line = (uint*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                uint* line = (uint*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy host to 2D Array (Width * TypeSize bytes per row, Height rows)
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array; its CUArray handle is used</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host (Width * TypeSize bytes per row, Height rows)
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array; its CUArray handle is used</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous copy host to device. Copies <see cref="SizeInBytes"/> bytes (pitch * height) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to device. Copies <see cref="SizeInBytes"/> bytes (pitch * height) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used</param>
        public void SynchronCopyToDevice(CudaDeviceVariable<uint> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy device to host: despite the method name, data is copied from <paramref name="devicePtr"/>
        /// into this host buffer. Copies <see cref="SizeInBytes"/> bytes (pitch * height) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy device to host: despite the method name, data is copied from <paramref name="devicePtr"/>
        /// into this host buffer. Copies <see cref="SizeInBytes"/> bytes (pitch * height) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used</param>
        public void SynchronCopyToHost(CudaDeviceVariable<uint> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous copy host to pitched device (Width * TypeSize bytes per row, Height rows)
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination in bytes</param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination; its DevicePointer and Pitch are used</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<uint> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous copy pitched device to host (Width * TypeSize bytes per row, Height rows)
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source in bytes</param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source; its DevicePointer and Pitch are used</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<uint> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy host to 2D Array (Width * TypeSize bytes per row, Height rows)
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync</param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array; its CUArray handle is used</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host (Width * TypeSize bytes per row, Height rows)
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync</param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array; its CUArray handle is used</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy host to device. Copies <see cref="SizeInBytes"/> bytes (pitch * height) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream passed to cuMemcpyHtoDAsync</param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy device to host. Copies <see cref="SizeInBytes"/> bytes (pitch * height) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream passed to cuMemcpyDtoHAsync</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to device. Copies <see cref="SizeInBytes"/> bytes (pitch * height) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used</param>
        /// <param name="stream">Stream passed to cuMemcpyHtoDAsync</param>
        public void AsyncCopyToDevice(CudaDeviceVariable<uint> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy device to host. Copies <see cref="SizeInBytes"/> bytes (pitch * height) as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used</param>
        /// <param name="stream">Stream passed to cuMemcpyDtoHAsync</param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<uint> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous copy host to pitched device (Width * TypeSize bytes per row, Height rows)
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination in bytes</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync</param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination; its DevicePointer and Pitch are used</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<uint> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host (Width * TypeSize bytes per row, Height rows)
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source in bytes</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source; its DevicePointer and Pitch are used</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<uint> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns>The flags reported by cuMemHostGetFlags for this buffer's host pointer</returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log the API that was actually called; this used to be mislabeled as cuMemHostGetDevicePointer.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumerator over the elements of this buffer; a disposed buffer cannot be enumerated.
        IEnumerator<uint> IEnumerable<uint>.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator<uint> enumerator = new CudaPageLockedHostMemory2DEnumerator_uint(this);
            return enumerator;
        }

        // Non-generic enumerator; hands out the same enumerator type as the generic overload.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator enumerator = new CudaPageLockedHostMemory2DEnumerator_uint(this);
            return enumerator;
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_uint.<para/>
    /// Visits the elements row by row: x advances first, y advances whenever x reaches the buffer width.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_uint : IEnumerator<uint>
    {
        private CudaPageLockedHostMemory2D_uint _memory;
        private SizeT _currentX;
        private SizeT _currentY;

        /// <summary>
        /// Creates an enumerator over <paramref name="memory"/>, positioned before the first element.
        /// </summary>
        /// <param name="memory">Buffer whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_uint(CudaPageLockedHostMemory2D_uint memory)
        {
            _memory = memory;
            Reset();
        }

        void IDisposable.Dispose()
        {
            // Nothing to release here: this class only stores a reference to the buffer.
        }

        /// <summary>
        /// Moves the enumerator back to its start position (x = -1, y = 0).
        /// </summary>
        public void Reset()
        {
            _currentY = 0;
            _currentX = -1;
        }

        /// <summary>
        /// Element at the current (x, y) position, read through the buffer's indexer.
        /// </summary>
        public uint Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Boxed variant of <see cref="Current"/>.
        /// </summary>
        object IEnumerator.Current
        {
            get
            {
                return Current;
            }
        }

        /// <summary>
        /// Advances to the next element, wrapping to the start of the next row at the end of a row.
        /// </summary>
        /// <returns>true while the current row index is below the buffer height; false otherwise</returns>
        public bool MoveNext()
        {
            _currentX += 1;

            bool endOfRow = (long)_currentX >= (long)_memory.Width;
            if (endOfRow)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: uint1
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_uint1: IDisposable, IEnumerable<uint1>
	{
		IntPtr _intPtr;
		uint1* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Allocates a 2D block of page-locked host memory with cuMemHostAlloc, using an explicit pitch and explicit flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_uint1(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _typeSize = (SizeT)Marshal.SizeOf(typeof(uint1));
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _sizeInBytes = _pitchInBytes * _height;

            // One row of 'width' elements has to fit into a single pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(uint1)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));

            // Typed alias of the host pointer.
            _ptr = (uint1*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint1 and allocates the memory on host. Using cuMemHostAlloc without flags
        /// (delegates to the four-argument constructor with allocFlags = 0).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uint1(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint1 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(uint1).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uint1(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uint1)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint1 and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(uint1).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_uint1(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uint1)), height, allocFlags)
        {

        }

        /// <summary>
        /// Finalizer. Calls Dispose(false), which does not free the memory but writes a debug warning
        /// if the object was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_uint1()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory (via Dispose(true)) and suppresses finalization of this object.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing"></param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Already released: nothing to free and nothing to warn about.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // Explicit disposal: return the buffer via cuMemFreeHost. The result code is only
                // logged, never thrown.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                // Finalizer path: the buffer is not freed here; only a debug warning is written.
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                // Raw address of the host allocation; not checked against the disposed flag.
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                // Distance in bytes between the starts of two consecutive rows.
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns></returns>
        public uint1 this[SizeT x, SizeT y]
        {
            // Note: neither accessor checks bounds or the disposed flag.
            get
            {
                // Rows are _pitchInBytes apart, so advance to row y in bytes, then index by element.
                uint1* row = (uint1*)(((byte*)_ptr) + _pitchInBytes * y);
                return row[x];
            }
            set
            {
                // Same addressing as the getter.
                uint1* row = (uint1*)(((byte*)_ptr) + _pitchInBytes * y);
                row[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host (pitched) -> CUDA array, _width * _typeSize bytes per row for _height rows.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            // Forward to the CUarray overload, which performs the disposed check and the copy.
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // CUDA array -> host (pitched), _width * _typeSize bytes per row for _height rows.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            // Forward to the CUarray overload, which performs the disposed check and the copy.
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Linear host -> device copy of SizeInBytes bytes starting at the host pointer.
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CudaDeviceVariable<uint1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Linear host -> device copy of SizeInBytes bytes into the variable's device pointer.
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Linear device -> host copy of SizeInBytes bytes into this buffer.
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CudaDeviceVariable<uint1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Linear device -> host copy of SizeInBytes bytes from the variable's device pointer.
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host (pitched) -> device (pitched); each side uses its own pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<uint1> deviceVar)
        {
            // Forward pointer and pitch of the pitched variable to the raw-pointer overload.
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Device (pitched) -> host (pitched); each side uses its own pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<uint1> deviceVar)
        {
            // Forward pointer and pitch of the pitched variable to the raw-pointer overload.
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host (pitched) -> CUDA array, issued through cuMemcpy2DAsync_v2 on the given stream.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            // Forward to the CUarray overload, which performs the disposed check and the copy.
            this.AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // CUDA array -> host (pitched), issued through cuMemcpy2DAsync_v2 on the given stream.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            // Forward to the CUarray overload, which performs the disposed check and the copy.
            this.AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Linear host -> device copy of SizeInBytes bytes, issued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Linear device -> host copy of SizeInBytes bytes, issued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaDeviceVariable<uint1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Linear host -> device copy into the variable's device pointer, issued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<uint1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Linear device -> host copy from the variable's device pointer, issued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host (pitched) -> device (pitched), issued through cuMemcpy2DAsync_v2 on the given stream.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<uint1> deviceVar, CUstream stream)
        {
            // Forward pointer and pitch of the pitched variable to the raw-pointer overload.
            this.AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Device (pitched) -> host (pitched), issued through cuMemcpy2DAsync_v2 on the given stream.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<uint1> deviceVar, CUstream stream)
        {
            // Forward pointer and pitch of the pitched variable to the raw-pointer overload.
            this.AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Ask the driver for the device-side pointer of this host buffer (flags argument is 0).
            CUdeviceptr mapped = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mapped, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mapped;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns></returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            // Queries the driver (cuMemHostGetFlags) for the flags of this host allocation.
            // Throws ObjectDisposedException after disposal and CudaException when the driver
            // reports anything other than CUResult.Success.
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Fix: the log previously named "cuMemHostGetDevicePointer", which is not the call made here.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        IEnumerator<uint1> IEnumerable<uint1>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // A fresh enumerator over this buffer for every call.
            return new CudaPageLockedHostMemory2DEnumerator_uint1(this);
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Non-generic view of the same enumerator type.
            return new CudaPageLockedHostMemory2DEnumerator_uint1(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_uint1
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_uint1 : IEnumerator<uint1>
    {
        // Buffer being enumerated, row by row (x runs fastest).
        private CudaPageLockedHostMemory2D_uint1 _memory = null;
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // Explicit position state instead of a -1 sentinel stored in a size type:
        // _started is false until the first MoveNext, _finished is true once the end was reached.
        private bool _started = false;
        private bool _finished = false;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate; must not be null.</param>
        /// <exception cref="ArgumentNullException"><paramref name="memory"/> is null.</exception>
        public CudaPageLockedHostMemory2DEnumerator_uint1(CudaPageLockedHostMemory2D_uint1 memory)
        {
            if (memory == null) throw new ArgumentNullException("memory");
            _memory = memory;
        }

        // The enumerator owns no resources; the buffer stays alive and is not disposed here.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = 0;
            _currentY = 0;
            _started = false;
            _finished = false;
        }

        /// <summary>
        /// Element at the current position.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The enumerator is positioned before the first or after the last element.
        /// </exception>
        public uint1 Current
        {
            get { return ReadCurrent(); }
        }

        /// <summary>
        /// Element at the current position (boxed).
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The enumerator is positioned before the first or after the last element.
        /// </exception>
        object IEnumerator.Current
        {
            get { return ReadCurrent(); }
        }

        /// <summary>
        /// Advances to the next element in row-major order (all x of row 0, then row 1, ...).
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false once the end is passed.</returns>
        public bool MoveNext()
        {
            if (_finished)
                return false;

            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // An empty extent in either direction means there is nothing to enumerate.
            if (width <= 0 || height <= 0)
            {
                _finished = true;
                return false;
            }

            if (!_started)
            {
                // First call: element (0, 0) becomes current.
                _started = true;
            }
            else
            {
                _currentX += 1;
                if ((long)_currentX >= width)
                {
                    // End of row reached: wrap to the start of the next row.
                    _currentX = 0;
                    _currentY += 1;
                }
            }

            if ((long)_currentY >= height)
            {
                _finished = true;
                return false;
            }

            return true;
        }

        // Guards the unchecked buffer indexer: reading before the first MoveNext or after the
        // end would address memory outside the allocation.
        private uint1 ReadCurrent()
        {
            if (!_started || _finished)
                throw new InvalidOperationException("Enumerator is positioned before the first or after the last element.");
            return _memory[_currentX, _currentY];
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: uint2
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_uint2: IDisposable, IEnumerable<uint2>
	{
		// Address of the host allocation as filled in by cuMemHostAlloc in the constructor.
		IntPtr _intPtr;
		// Same address as _intPtr, typed for element access through the indexer.
		uint2* _ptr;
        // Total allocation size: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Row length in elements.
        SizeT _width = 0;
        // Distance between the starts of two consecutive rows, in bytes.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(uint2)), cached by the constructor.
        SizeT _typeSize = 0;
        // Result code of the most recent driver API call made by this instance.
        CUResult res;
        // Set to true once Dispose(true) has called cuMemFreeHost.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_uint2(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            // Record the geometry first; the allocation size is pitch * height.
            _typeSize = (SizeT)Marshal.SizeOf(typeof(uint2));
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _sizeInBytes = _pitchInBytes * _height;
            _intPtr = IntPtr.Zero;

            // A row must be able to hold width elements of uint2.
            bool pitchTooSmall = _typeSize * width > _pitchInBytes;
            if (pitchTooSmall)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(uint2)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            // Typed alias of the allocation, assigned before the result check as in the driver call order.
            _ptr = (uint2*)_intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uint2(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
            // Nothing else to do: the four-argument constructor allocates with no flags set.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(uint2).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uint2(SizeT width, SizeT height)
            : this(width, height, (CUMemHostAllocFlags)0)
        {
            // Routed through the (width, height, allocFlags) overload, which derives the pitch as
            // width * Marshal.SizeOf(typeof(uint2)); no allocation flags are set.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint2 and allocates the memory on host using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(uint2).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_uint2(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(
                width,
                width * (SizeT)Marshal.SizeOf(typeof(uint2)),   // pitch of a tightly packed row
                height,
                allocFlags)
        {
            // All work happens in the four-argument constructor.
        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_uint2()
        {
            // Finalizer path: Dispose(bool) is called with fDisposing == false.
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            // Explicit disposal: free the buffer first ...
            this.Dispose(true);

            // ... then tell the GC that the finalizer no longer needs to run.
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing"></param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Already released: nothing to free and nothing to warn about.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // Explicit disposal: return the buffer via cuMemFreeHost. The result code is only
                // logged, never thrown.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                // Finalizer path: the buffer is not freed here; only a debug warning is written.
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                // Raw address of the host allocation; not checked against the disposed flag.
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                // Distance in bytes between the starts of two consecutive rows.
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                // Pitch times height, as computed by the constructor.
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns></returns>
        public uint2 this[SizeT x, SizeT y]
        {
            // Note: neither accessor checks bounds or the disposed flag.
            get
            {
                // Rows are _pitchInBytes apart, so advance to row y in bytes, then index by element.
                uint2* row = (uint2*)(((byte*)_ptr) + _pitchInBytes * y);
                return row[x];
            }
            set
            {
                // Same addressing as the getter.
                uint2* row = (uint2*)(((byte*)_ptr) + _pitchInBytes * y);
                row[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host (pitched) -> CUDA array, _width * _typeSize bytes per row for _height rows.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            // Forward to the CUarray overload, which performs the disposed check and the copy.
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // CUDA array -> host (pitched), _width * _typeSize bytes per row for _height rows.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            // Forward to the CUarray overload, which performs the disposed check and the copy.
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Linear host -> device copy of the whole allocation (pitch * height bytes).
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from this host buffer to the memory of a device variable using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used as the copy target</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<uint2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(destination, _intPtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from device memory into this host buffer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from the memory of a device variable into this host buffer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used as the copy source</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<uint2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this host buffer to pitched device memory using cuMemcpy2D_v2.<para/>
        /// Each row transfers Width * TypeSize bytes; source rows use this buffer's pitch, destination rows use <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination memory, passed as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer to a pitched device variable.<para/>
        /// Convenience overload: forwards the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<uint2> deviceVar)
        {
            CUdeviceptr destination = deviceVar.DevicePointer;
            SizeT destinationPitch = deviceVar.Pitch;
            this.SynchronCopyToDevice(destination, destinationPitch);
        }

        /// <summary>
        /// Synchronously copies pitched device memory into this host buffer using cuMemcpy2D_v2.<para/>
        /// Each row transfers Width * TypeSize bytes; source rows use <paramref name="pitchDevice"/>, destination rows use this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source memory, passed as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a pitched device variable into this host buffer.<para/>
        /// Convenience overload: forwards the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<uint2> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            this.SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Issues a copy of this host buffer into a 2D CUDA array on the given stream using cuMemcpy2DAsync_v2.<para/>
        /// Each row transfers Width * TypeSize bytes; source rows use this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Handle of the destination CUDA array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues a copy of this host buffer into a 2D CUDA array on the given stream.<para/>
        /// Convenience overload: reads the handle from <paramref name="array"/> and forwards it.
        /// </summary>
        /// <param name="array">Wrapper of the destination 2D array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray destination = array.CUArray;
            this.AsyncCopyToArray2D(destination, stream);
        }

        /// <summary>
        /// Issues a copy of a 2D CUDA array into this host buffer on the given stream using cuMemcpy2DAsync_v2.<para/>
        /// Each row transfers Width * TypeSize bytes; destination rows use this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Handle of the source CUDA array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues a copy of a 2D CUDA array into this host buffer on the given stream.<para/>
        /// Convenience overload: reads the handle from <paramref name="array"/> and forwards it.
        /// </summary>
        /// <param name="array">Wrapper of the source 2D array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray source = array.CUArray;
            this.AsyncCopyFromArray2D(source, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Issues a copy of SizeInBytes bytes from this host buffer to device memory on the given stream using cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, this._intPtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues a copy of SizeInBytes bytes from device memory into this host buffer on the given stream using cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, devicePtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues a copy of SizeInBytes bytes from this host buffer to the memory of a device variable on the given stream using cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used as the copy target</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<uint2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(destination, this._intPtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues a copy of SizeInBytes bytes from the memory of a device variable into this host buffer on the given stream using cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used as the copy source</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<uint2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, source, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Issues a copy of this host buffer to pitched device memory on the given stream using cuMemcpy2DAsync_v2.<para/>
        /// Each row transfers Width * TypeSize bytes; source rows use this buffer's pitch, destination rows use <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination memory, passed as dstPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues a copy of this host buffer to a pitched device variable on the given stream.<para/>
        /// Convenience overload: forwards the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<uint2> deviceVar, CUstream stream)
        {
            CUdeviceptr destination = deviceVar.DevicePointer;
            SizeT destinationPitch = deviceVar.Pitch;
            this.AsyncCopyToDevice(destination, destinationPitch, stream);
        }

        /// <summary>
        /// Issues a copy of pitched device memory into this host buffer on the given stream using cuMemcpy2DAsync_v2.<para/>
        /// Each row transfers Width * TypeSize bytes; source rows use <paramref name="pitchDevice"/>, destination rows use this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source memory, passed as srcPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues a copy of a pitched device variable into this host buffer on the given stream.<para/>
        /// Convenience overload: forwards the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<uint2> deviceVar, CUstream stream)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            this.AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Asks the driver (cuMemHostGetDevicePointer_v2) for the device pointer that corresponds to this pinned host buffer.<para/>
        /// Only valid if the context was created with flag <see cref="CUCtxFlags.MapHost"/>.
        /// </summary>
        /// <returns>Device pointer mapped to this host memory</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr mapped = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mapped, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mapped;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (queried with cuMemHostGetFlags).
        /// </summary>
        /// <returns>The allocation flags reported by the driver</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the API that was actually called. This line previously logged
            // "cuMemHostGetDevicePointer", which made debug output point at the wrong driver call.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_uint2 bound to this instance</returns>
        IEnumerator<uint2> IEnumerable<uint2>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uint2(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_uint2 bound to this instance</returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uint2(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_uint2.<para/>
    /// Walks the buffer row by row: x advances within a row, and when it reaches Width
    /// it wraps to 0 and y advances to the next row.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_uint2 : IEnumerator<uint2>
    {
        private CudaPageLockedHostMemory2D_uint2 _memory = null;
        // Positioned before the first column; intended so that the first MoveNext() lands on (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over the given buffer.
        /// </summary>
        /// <param name="memory">Buffer whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_uint2(CudaPageLockedHostMemory2D_uint2 memory)
        {
            _memory = memory;
        }

        // The enumerator owns no resources, so there is nothing to release.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current (x, y) position, read through the buffer's indexer.
        /// </summary>
        public uint2 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current (x, y) position, boxed as object.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true while the position is inside the buffer; false once all rows are exhausted or the buffer has no elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A buffer with no columns or no rows contains no elements. Without this guard a
            // zero-width buffer would report one element per row and Current would read
            // element [0, y] of rows that have no columns.
            if (width <= 0 || height <= 0)
            {
                return false;
            }

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                // End of row reached: wrap to the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy.<para/>
	/// Type: uint3
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_uint3: IDisposable, IEnumerable<uint3>
	{
		// Address of the page-locked host allocation; passed by ref to cuMemHostAlloc in the constructor.
		IntPtr _intPtr;
		// The same address as _intPtr, typed as uint3* for element access through the indexer.
		uint3* _ptr;
        // Allocation size in bytes; the constructor sets it to _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Number of elements per row.
        SizeT _width = 0;
        // Row stride in bytes; the constructor rejects values smaller than _width * _typeSize.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(uint3)), set in the constructor.
        SizeT _typeSize = 0;
        // Result code of the most recent driver API call made by this instance.
        CUResult res;
        // Set to true once Dispose(true) has called cuMemFreeHost; copy methods check it before use.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint3 and allocates pitchInBytes * height bytes of host memory using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(uint3).</exception>
        /// <exception cref="CudaException">cuMemHostAlloc returned a result other than <see cref="CUResult.Success"/>.</exception>
        public CudaPageLockedHostMemory2D_uint3(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = IntPtr.Zero;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(uint3));
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _sizeInBytes = _pitchInBytes * _height;

            // A row must be able to hold all of its elements.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(uint3)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (uint3*) _intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint3 with an explicit pitch and allocates the memory on host using cuMemHostAlloc with no flags set.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uint3(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint3 and allocates the memory on host using cuMemHostAlloc with no flags set.<para/>
        /// Pitch is assumed to be width * sizeof(uint3), i.e. rows are tightly packed.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uint3(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uint3)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint3 and allocates the memory on host using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(uint3), i.e. rows are tightly packed.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_uint3(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uint3)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_uint3()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the host allocation and suppresses finalization of this instance.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable. When called with <paramref name="fDisposing"/> set, frees the host memory with cuMemFreeHost
        /// and marks the instance as disposed. When called from the finalizer on an instance that was never disposed,
        /// only a debug warning is written; the memory is not freed on that path.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Nothing left to do once the allocation has been released.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Address of the pinned host allocation.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Row stride in bytes, as given to the constructor.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total allocation size in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one uint3 element in bytes, as reported by Marshal.SizeOf.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element. The row start is located at pitch * y bytes from the
        /// beginning of the allocation; x then indexes elements within that row.<para/>
        /// No range or disposed check is performed.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at column x of row y</returns>
        public uint3 this[SizeT x, SizeT y]
        {
            get
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                uint3* row = (uint3*)rowStart;
                return *(row + x);
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                uint3* row = (uint3*)rowStart;
                *(row + x) = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronously copies this host buffer into a 2D CUDA array using cuMemcpy2D_v2.<para/>
        /// Each row transfers Width * TypeSize bytes; source rows use this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Handle of the destination CUDA array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer into a 2D CUDA array.<para/>
        /// Convenience overload: reads the handle from <paramref name="array"/> and forwards it.
        /// </summary>
        /// <param name="array">Wrapper of the destination 2D array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray destination = array.CUArray;
            this.SynchronCopyToArray2D(destination);
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this host buffer using cuMemcpy2D_v2.<para/>
        /// Each row transfers Width * TypeSize bytes; destination rows use this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Handle of the source CUDA array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this host buffer.<para/>
        /// Convenience overload: reads the handle from <paramref name="array"/> and forwards it.
        /// </summary>
        /// <param name="array">Wrapper of the source 2D array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray source = array.CUArray;
            this.SynchronCopyFromArray2D(source);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronously copies the whole allocation (SizeInBytes = pitch * height bytes) from this host buffer
        /// to device memory using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies the whole allocation (SizeInBytes = pitch * height bytes) from this host buffer
        /// to the memory of a device variable using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used as the copy target</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<uint3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(destination, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes (pitch * height) bytes from device memory into this host buffer
        /// using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes (pitch * height) bytes from the memory of a device variable
        /// into this host buffer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used as the copy source</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<uint3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this host buffer to pitched device memory using cuMemcpy2D_v2.<para/>
        /// Each row transfers Width * TypeSize bytes; source rows use this buffer's pitch, destination rows use <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination memory, passed as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer to a pitched device variable.<para/>
        /// Convenience overload: forwards the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<uint3> deviceVar)
        {
            CUdeviceptr destination = deviceVar.DevicePointer;
            SizeT destinationPitch = deviceVar.Pitch;
            this.SynchronCopyToDevice(destination, destinationPitch);
        }

        /// <summary>
        /// Synchronously copies pitched device memory into this host buffer using cuMemcpy2D_v2.<para/>
        /// Each row transfers Width * TypeSize bytes; source rows use <paramref name="pitchDevice"/>, destination rows use this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source memory, passed as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call returned a result other than <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a pitched device variable into this host buffer.<para/>
        /// Convenience overload: forwards the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<uint3> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            this.SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous 2D copy from this host buffer into a CUDA array (cuMemcpy2DAsync_v2 on <paramref name="stream"/>).<para/>
        /// Every row transfers width * element size bytes, read from the host using this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from this host buffer into a 2D CUDA array.<para/>
        /// Forwards to the (CUarray, CUstream) overload using the array's CUArray handle.
        /// </summary>
        /// <param name="array">Destination 2D array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from a CUDA array into this host buffer (cuMemcpy2DAsync_v2 on <paramref name="stream"/>).<para/>
        /// Every row transfers width * element size bytes, written to the host using this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a 2D CUDA array into this host buffer.<para/>
        /// Forwards to the (CUarray, CUstream) overload using the array's CUArray handle.
        /// </summary>
        /// <param name="array">Source 2D array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from this host buffer to device memory
        /// (cuMemcpyHtoDAsync_v2 on <paramref name="stream"/>).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from device memory into this host buffer
        /// (cuMemcpyDtoHAsync_v2 on <paramref name="stream"/>).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from this host buffer to a device variable
        /// (cuMemcpyHtoDAsync_v2 on <paramref name="stream"/>).
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is the copy target</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<uint3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from a device variable into this host buffer
        /// (cuMemcpyDtoHAsync_v2 on <paramref name="stream"/>).
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is the copy source</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<uint3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy from this host buffer to pitched device memory (cuMemcpy2DAsync_v2 on <paramref name="stream"/>).<para/>
        /// Every row transfers width * element size bytes; the host is read with this buffer's pitch
        /// and the device is written with <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory (used as dstPitch)</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from this host buffer to a pitched device variable.<para/>
        /// Forwards to the (CUdeviceptr, SizeT, CUstream) overload using the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<uint3> deviceVar, CUstream stream)
        {
            this.AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this host buffer (cuMemcpy2DAsync_v2 on <paramref name="stream"/>).<para/>
        /// Every row transfers width * element size bytes; the device is read with
        /// <paramref name="pitchDevice"/> and the host is written with this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory (used as srcPitch)</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a pitched device variable into this host buffer.<para/>
        /// Forwards to the (CUdeviceptr, SizeT, CUstream) overload using the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<uint3> deviceVar, CUstream stream)
        {
            this.AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer that maps this pinned host allocation (cuMemHostGetDevicePointer_v2, flags = 0).
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>Allocation flags reported by the driver for this host pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log under the name of the API that was actually called; the previous label
            // ("cuMemHostGetDevicePointer") was a copy/paste leftover and made traces misleading.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        IEnumerator<uint3> IEnumerable<uint3>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uint3(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uint3(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_uint3.<para/>
    /// Walks the buffer row by row: X advances within a row and wraps to 0 while Y is incremented
    /// once X reaches the buffer's Width; enumeration ends when Y reaches the buffer's Height.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_uint3 : IEnumerator<uint3>
    {
        // Buffer being enumerated; elements are read through its [x, y] indexer.
        private CudaPageLockedHostMemory2D_uint3 _memory;
        // Current column; starts at -1 so that the first MoveNext() steps onto the first column.
        private SizeT _currentX = -1;
        // Current row.
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_uint3(CudaPageLockedHostMemory2D_uint3 memory)
        {
            _memory = memory;
        }

        /// <summary>
        /// Nothing to release; the enumerator does not own the buffer.
        /// </summary>
        void IDisposable.Dispose()
        {
        }

        /// <summary>
        /// Moves the enumerator back to its initial position (before the first element).
        /// </summary>
        public void Reset()
        {
            _currentY = 0;
            _currentX = -1;
        }

        /// <summary>
        /// Element at the current (X, Y) position, read through the buffer's indexer.
        /// </summary>
        public uint3 Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Boxed element at the current (X, Y) position.
        /// </summary>
        object IEnumerator.Current
        {
            get
            {
                return Current;
            }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true while the current row index is below the buffer's Height, otherwise false</returns>
        public bool MoveNext()
        {
            _currentX += 1;

            // Stepped past the last column: continue at the start of the next row.
            if ((long)_currentX >= (long)_memory.Width)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: uint4
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_uint4: IDisposable, IEnumerable<uint4>
	{
		// Host pointer filled in by cuMemHostAlloc in the constructor; released by cuMemFreeHost in Dispose.
		IntPtr _intPtr;
		// Same address as _intPtr, typed as uint4* for element access in the indexer.
		uint4* _ptr;
        // Total allocation size in bytes (_pitchInBytes * _height).
        SizeT _sizeInBytes = 0;
        // Row width in elements.
        SizeT _width = 0;
        // Row stride in bytes; validated to be at least _width * _typeSize.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(uint4)).
        SizeT _typeSize = 0;
        // Result of the most recent driver API call issued by this instance.
        CUResult res;
        // Set to true once Dispose has freed the host memory.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint4 and allocates pitchInBytes * height bytes of
        /// host memory using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">width * sizeof(uint4) exceeds <paramref name="pitchInBytes"/>.</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/>.</exception>
        public CudaPageLockedHostMemory2D_uint4(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(uint4));
            _sizeInBytes = _pitchInBytes * _height;

            // A row of elements has to fit inside one pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(uint4)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (uint4*)_intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint4 with an explicit pitch and allocates the
        /// memory on host using cuMemHostAlloc with allocation flags 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uint4(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint4 and allocates the memory on host using
        /// cuMemHostAlloc with allocation flags 0.<para/>
        /// Pitch is set to width * sizeof(uint4).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_uint4(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uint4)), height, 0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_uint4 and allocates the memory on host using
        /// cuMemHostAlloc with the given allocation flags.<para/>
        /// Pitch is set to width * sizeof(uint4).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_uint4(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(uint4)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of Dispose(bool), i.e. Dispose(false).
        /// </summary>
        ~CudaPageLockedHostMemory2D_uint4()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory via Dispose(true) and removes this object from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable. Does nothing if the object is already disposed.<para/>
        /// When called with true, frees the host memory with cuMemFreeHost and marks the object as disposed.
        /// When called with false (finalizer), only writes a not-disposed warning to the debug output;
        /// the memory is not freed on that path.
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to the pinned host memory allocated by the constructor.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Row width in elements, as passed to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows, as passed to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Row stride of the host buffer in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total allocation size in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one uint4 element in bytes (Marshal.SizeOf).
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element. The row is located at byte offset pitch * y from the start of
        /// the buffer and then indexed by x.<para/>
        /// No bounds or disposed checks are performed here; the indices are used as given.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>Element stored at column x of row y</returns>
        public uint4 this[SizeT x, SizeT y]
        {
            get
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((uint4*)rowStart)[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((uint4*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous 2D copy from this host buffer into a CUDA array (cuMemcpy2D_v2).<para/>
        /// Every one of the Height rows transfers Width * TypeSize bytes, read from the host using Pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from this host buffer into a 2D CUDA array.<para/>
        /// Forwards to the CUarray overload using the array's CUArray handle.
        /// </summary>
        /// <param name="array">Destination 2D array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous 2D copy from a CUDA array into this host buffer (cuMemcpy2D_v2).<para/>
        /// Every one of the Height rows transfers Width * TypeSize bytes, written to the host using Pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from a 2D CUDA array into this host buffer.<para/>
        /// Forwards to the CUarray overload using the array's CUArray handle.
        /// </summary>
        /// <param name="array">Source 2D array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy of the whole host buffer (SizeInBytes = pitch * height bytes)
        /// to device memory (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of the whole host buffer (SizeInBytes = pitch * height bytes)
        /// to a device variable (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is the copy target</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<uint4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of SizeInBytes (pitch * height) bytes from device memory
        /// into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of SizeInBytes (pitch * height) bytes from a device variable
        /// into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is the copy source</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<uint4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy from this host buffer to pitched device memory (cuMemcpy2D_v2).<para/>
        /// Every one of the Height rows transfers Width * TypeSize bytes; the host is read with Pitch
        /// and the device is written with <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory (used as dstPitch)</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from this host buffer to a pitched device variable.<para/>
        /// Forwards to the (CUdeviceptr, SizeT) overload using the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<uint4> deviceVar)
        {
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this host buffer (cuMemcpy2D_v2).<para/>
        /// Every one of the Height rows transfers Width * TypeSize bytes; the device is read with
        /// <paramref name="pitchDevice"/> and the host is written with Pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory (used as srcPitch)</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from a pitched device variable into this host buffer.<para/>
        /// Forwards to the (CUdeviceptr, SizeT) overload using the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<uint4> deviceVar)
        {
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous 2D copy from this host buffer into a CUDA array (cuMemcpy2DAsync_v2 on <paramref name="stream"/>).<para/>
        /// Every one of the Height rows transfers Width * TypeSize bytes, read from the host using Pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from this host buffer into a 2D CUDA array.<para/>
        /// Forwards to the (CUarray, CUstream) overload using the array's CUArray handle.
        /// </summary>
        /// <param name="array">Destination 2D array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from a CUDA array into this host buffer (cuMemcpy2DAsync_v2 on <paramref name="stream"/>).<para/>
        /// Every one of the Height rows transfers Width * TypeSize bytes, written to the host using Pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a 2D CUDA array into this host buffer.<para/>
        /// Forwards to the (CUarray, CUstream) overload using the array's CUArray handle.
        /// </summary>
        /// <param name="array">Source 2D array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous linear copy of the whole host buffer (SizeInBytes = pitch * height bytes)
        /// to device memory (cuMemcpyHtoDAsync_v2 on <paramref name="stream"/>).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of SizeInBytes (pitch * height) bytes from device memory
        /// into this host buffer (cuMemcpyDtoHAsync_v2 on <paramref name="stream"/>).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of the whole host buffer (SizeInBytes = pitch * height bytes)
        /// to a device variable (cuMemcpyHtoDAsync_v2 on <paramref name="stream"/>).
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is the copy target</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<uint4> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of SizeInBytes (pitch * height) bytes from a device variable
        /// into this host buffer (cuMemcpyDtoHAsync_v2 on <paramref name="stream"/>).
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is the copy source</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<uint4> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Enqueues an asynchronous 2D copy of this pinned host buffer to pitched device memory
        /// using cuMemcpy2DAsync. Each of the Height rows transfers Width * TypeSize bytes; the
        /// host side uses this buffer's pitch, the device side uses <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory in bytes</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Describe the transfer: device destination, pinned host source.
            CUDAMemCpy2D copyDescriptor = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyDescriptor, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            bool succeeded = res == CUResult.Success;
            if (!succeeded)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues an asynchronous 2D copy of this pinned host buffer to a pitched device variable.
        /// Forwards the variable's device pointer and pitch to the
        /// (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<uint4> deviceVar, CUstream stream)
        {
            // Read in the same left-to-right order as the original argument list.
            var destination = deviceVar.DevicePointer;
            var destinationPitch = deviceVar.Pitch;

            AsyncCopyToDevice(destination, destinationPitch, stream);
        }

        /// <summary>
        /// Enqueues an asynchronous 2D copy from pitched device memory into this pinned host buffer
        /// using cuMemcpy2DAsync. Each of the Height rows transfers Width * TypeSize bytes; the
        /// device side uses <paramref name="pitchDevice"/>, the host side uses this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory in bytes</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Describe the transfer: device source, pinned host destination.
            CUDAMemCpy2D copyDescriptor = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyDescriptor, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            bool succeeded = res == CUResult.Success;
            if (!succeeded)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues an asynchronous 2D copy from a pitched device variable into this pinned host buffer.
        /// Forwards the variable's device pointer and pitch to the
        /// (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<uint4> deviceVar, CUstream stream)
        {
            // Read in the same left-to-right order as the original argument list.
            var source = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;

            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer reported by cuMemHostGetDevicePointer for this host allocation</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res == CUResult.Success)
            {
                return mappedPointer;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// (queried through cuMemHostGetFlags).
        /// </summary>
        /// <returns>Allocation flags reported by the driver for this host allocation</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log the API that was actually called (previously mislabeled as cuMemHostGetDevicePointer).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        IEnumerator<uint4> IEnumerable<uint4>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uint4(this);
        }

        /// <summary>
        /// Creates an untyped enumerator over the elements of this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_uint4(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_uint4.<para/>
    /// Elements are visited row by row: X advances within a row and wraps to 0 while Y is
    /// incremented once X reaches the buffer width. Enumeration ends when Y reaches the height.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_uint4 : IEnumerator<uint4>
    {
        private CudaPageLockedHostMemory2D_uint4 _memory = null;
        // Starts one step before the first element; the first MoveNext() advances to (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_uint4(CudaPageLockedHostMemory2D_uint4 memory)
        {
            _memory = memory;
        }

        // The enumerator owns no resources; the buffer's lifetime is managed by its creator.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The enumerator is positioned before the first or after the last element.
        /// </exception>
        public uint4 Current
        {
            get
            {
                EnsureValidPosition();
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Element at the current position, boxed.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The enumerator is positioned before the first or after the last element.
        /// </exception>
        object IEnumerator.Current
        {
            get { return Current; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false once all rows were visited</returns>
        public bool MoveNext()
        {
            // Once past the last row stay there instead of letting the counters drift further.
            if ((long)_currentY >= (long)_memory.Height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: wrap to the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }

        /// <summary>
        /// Guards the element access in <see cref="Current"/>: the buffer indexer reads the pinned
        /// memory directly, so a position outside [0, Width) x [0, Height) must never reach it.
        /// </summary>
        private void EnsureValidPosition()
        {
            long x = (long)_currentX;
            long y = (long)_currentY;

            if (x < 0 || x >= (long)_memory.Width || y < 0 || y >= (long)_memory.Height)
                throw new InvalidOperationException("Enumeration has either not started or has already finished.");
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy.<para/>
	/// Type: long
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_long: IDisposable, IEnumerable<long>
	{
		IntPtr _intPtr;
		long* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_long(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(long));
            _sizeInBytes = _pitchInBytes * _height;

            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(long)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
			_ptr = (long*) _intPtr;
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_long(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(long). Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_long(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(long)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(long). Using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_long(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(long)), height, allocFlags)
        {

        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_long()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing"></param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns></returns>
        public long this[SizeT x, SizeT y]
        {
            get
            {
				long* line = (long*)(((byte*)_ptr) + _pitchInBytes * y);
				return line[x];
            }
            set
            {
				long* line = (long*)(((byte*)_ptr) + _pitchInBytes * y);
				line[x] = value;                
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CudaDeviceVariable<long> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CudaDeviceVariable<long> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<long> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<long> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaDeviceVariable<long> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<long> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<long> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<long> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns></returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator (CudaPageLockedHostMemory2DEnumerator_long) over this memory.
        /// </summary>
        /// <returns>A new enumerator bound to this instance</returns>
        IEnumerator<long> IEnumerable<long>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_long(this);
        }

        /// <summary>
        /// Creates an untyped enumerator (CudaPageLockedHostMemory2DEnumerator_long) over this memory.
        /// </summary>
        /// <returns>A new enumerator bound to this instance</returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_long(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_long.<para/>
    /// Elements are visited row by row: x runs from 0 to Width - 1 for every y from 0 to Height - 1.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_long : IEnumerator<long>
    {
        private CudaPageLockedHostMemory2D_long _memory = null;
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // False until the first MoveNext. Replaces the former "-1" start position so that no negative
        // value has to be stored in a SizeT (NOTE(review): SizeT looks like an unsigned size type - confirm).
        private bool _started = false;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Memory to iterate over</param>
        /// <exception cref="ArgumentNullException"><paramref name="memory"/> is null</exception>
        public CudaPageLockedHostMemory2DEnumerator_long(CudaPageLockedHostMemory2D_long memory)
        {
            if (memory == null) throw new ArgumentNullException("memory");
            _memory = memory;
        }

        // The enumerator owns no resources; the memory it iterates over is not disposed here.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back in front of the first element.
        /// </summary>
        public void Reset()
        {
            _started = false;
            _currentX = 0;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position. Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        public long Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed). Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false if there are no (more) elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A memory without columns or rows has nothing to enumerate. Without this check a
            // zero-width memory would report one element per row wrap-around.
            if (width <= 0 || height <= 0)
                return false;

            if (!_started)
            {
                _started = true;
                _currentX = 0;
                _currentY = 0;
                return true;
            }

            // Already past the last row: stay there.
            if ((long)_currentY >= height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A 2D variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: long1<para/>
	/// The memory is allocated with cuMemHostAlloc in the constructor and released with cuMemFreeHost in <see cref="Dispose()"/>.
	/// Rows are <see cref="Pitch"/> bytes apart.
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_long1: IDisposable, IEnumerable<long1>
	{
        IntPtr _intPtr;
        long1* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        // Result of the most recent driver call made through this instance.
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException"><paramref name="pitchInBytes"/> is smaller than width * sizeof(long1)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_long1(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(long1));
            _sizeInBytes = _pitchInBytes * _height;

            // Validate before allocating: a row must be able to hold 'width' elements.
            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(long1)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            _ptr = (long1*) _intPtr;
            CheckResult("cuMemHostAlloc");
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_long1(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(long1).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_long1(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(long1)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long1 and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(long1).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_long1(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(long1)), height, allocFlags)
        {

        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_long1()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory (cuMemFreeHost) and suppresses finalization.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable.<para/>
        /// When called from <see cref="Dispose()"/> the memory is freed once; the result of cuMemFreeHost is only
        /// written to the debug output, no exception is thrown. When called from the finalizer nothing is freed,
        /// only a not-disposed warning is written to the debug output.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
                return;

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                // NOTE(review): the finalizer path deliberately does not call into the driver - presumably
                // because no CUDA context is guaranteed on the finalizer thread; confirm.
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Private helpers
        /// <summary>
        /// Throws an <see cref="ObjectDisposedException"/> if this instance has already been disposed.
        /// </summary>
        private void ThrowIfDisposed()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
        }

        /// <summary>
        /// Writes the result of the last driver call to the debug output and throws a
        /// <see cref="CudaException"/> if it is not <see cref="CUResult.Success"/>.
        /// </summary>
        /// <param name="apiName">Name of the driver function, used in the debug message only</param>
        private void CheckResult(string apiName)
        {
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, apiName, res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Creates a 2D copy description with this host memory filled in as the source.
        /// The caller still has to fill in the destination.
        /// </summary>
        private CUDAMemCpy2D HostAsSource()
        {
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;
            return cpyProps;
        }

        /// <summary>
        /// Creates a 2D copy description with this host memory filled in as the destination.
        /// The caller still has to fill in the source.
        /// </summary>
        private CUDAMemCpy2D HostAsDestination()
        {
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;
            return cpyProps;
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes (pitch in bytes * height)
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element.<para/>
        /// No range or disposed check is performed: the indices are used directly to address the pinned memory.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>Element in column x of row y</returns>
        public long1 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows start every _pitchInBytes bytes, so the row offset is computed on a byte pointer.
                long1* line = (long1*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                long1* line = (long1*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = HostAsSource();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            CheckResult("cuMemcpy2D_v2");
        }

        /// <summary>
        /// Synchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = HostAsDestination();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            CheckResult("cuMemcpy2D_v2");
        }

        /// <summary>
        /// Synchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous copy host to device. Copies <see cref="SizeInBytes"/> bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            CheckResult("cuMemcpyHtoD");
        }

        /// <summary>
        /// Synchronous copy host to device. Copies <see cref="SizeInBytes"/> bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        public void SynchronCopyToDevice(CudaDeviceVariable<long1> devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            CheckResult("cuMemcpyHtoD");
        }

        /// <summary>
        /// Synchronous copy device to host: reads <see cref="SizeInBytes"/> bytes from <paramref name="devicePtr"/> into this host memory.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            CheckResult("cuMemcpyDtoH");
        }

        /// <summary>
        /// Synchronous copy device to host: reads <see cref="SizeInBytes"/> bytes from <paramref name="devicePtr"/> into this host memory.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        public void SynchronCopyToHost(CudaDeviceVariable<long1> devicePtr)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            CheckResult("cuMemcpyDtoH");
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous copy host to pitched device
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = HostAsSource();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            CheckResult("cuMemcpy2D_v2");
        }

        /// <summary>
        /// Synchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<long1> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous copy pitched device to host
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = HostAsDestination();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            CheckResult("cuMemcpy2D_v2");
        }

        /// <summary>
        /// Synchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<long1> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = HostAsSource();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            CheckResult("cuMemcpy2DAsync");
        }

        /// <summary>
        /// Asynchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream passed on to the copy</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = HostAsDestination();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            CheckResult("cuMemcpy2DAsync");
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array</param>
        /// <param name="stream">Stream passed on to the copy</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy host to device. Copies <see cref="SizeInBytes"/> bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream passed to cuMemcpyHtoDAsync_v2</param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            CheckResult("cuMemcpyHtoDAsync");
        }

        /// <summary>
        /// Asynchronous copy device to host. Copies <see cref="SizeInBytes"/> bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream passed to cuMemcpyDtoHAsync_v2</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            CheckResult("cuMemcpyDtoHAsync");
        }

        /// <summary>
        /// Asynchronous copy host to device. Copies <see cref="SizeInBytes"/> bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream passed to cuMemcpyHtoDAsync_v2</param>
        public void AsyncCopyToDevice(CudaDeviceVariable<long1> devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            CheckResult("cuMemcpyHtoDAsync");
        }

        /// <summary>
        /// Asynchronous copy device to host. Copies <see cref="SizeInBytes"/> bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream passed to cuMemcpyDtoHAsync_v2</param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<long1> devicePtr, CUstream stream)
        {
            ThrowIfDisposed();
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            CheckResult("cuMemcpyDtoHAsync");
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous copy host to pitched device
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = HostAsSource();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            CheckResult("cuMemcpy2DAsync");
        }

        /// <summary>
        /// Asynchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream passed on to the copy</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<long1> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            ThrowIfDisposed();
            CUDAMemCpy2D cpyProps = HostAsDestination();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            CheckResult("cuMemcpy2DAsync");
        }

        /// <summary>
        /// Asynchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream passed on to the copy</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<long1> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            ThrowIfDisposed();
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            CheckResult("cuMemHostGetDevicePointer");
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer.<para/>
        /// The flags are queried from the driver with cuMemHostGetFlags.
        /// </summary>
        /// <returns>Allocation flags reported by the driver for this buffer</returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            ThrowIfDisposed();
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the function that was actually called (the label used to say cuMemHostGetDevicePointer).
            CheckResult("cuMemHostGetFlags");
            return flags;
        }
        #endregion

        #region IEnumerable
        IEnumerator<long1> IEnumerable<long1>.GetEnumerator()
        {
            ThrowIfDisposed();
            IEnumerator<long1> enumerator = new CudaPageLockedHostMemory2DEnumerator_long1(this);
            return enumerator;
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            ThrowIfDisposed();
            IEnumerator enumerator = new CudaPageLockedHostMemory2DEnumerator_long1(this);
            return enumerator;
        }

        #endregion
	}

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_long1.<para/>
    /// Elements are visited row by row: x runs from 0 to Width - 1 for every y from 0 to Height - 1.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_long1 : IEnumerator<long1>
    {
        private CudaPageLockedHostMemory2D_long1 _memory = null;
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // False until the first MoveNext. Replaces the former "-1" start position so that no negative
        // value has to be stored in a SizeT (NOTE(review): SizeT looks like an unsigned size type - confirm).
        private bool _started = false;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Memory to iterate over</param>
        /// <exception cref="ArgumentNullException"><paramref name="memory"/> is null</exception>
        public CudaPageLockedHostMemory2DEnumerator_long1(CudaPageLockedHostMemory2D_long1 memory)
        {
            if (memory == null) throw new ArgumentNullException("memory");
            _memory = memory;
        }

        // The enumerator owns no resources; the memory it iterates over is not disposed here.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back in front of the first element.
        /// </summary>
        public void Reset()
        {
            _started = false;
            _currentX = 0;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position. Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        public long1 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed). Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false if there are no (more) elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A memory without columns or rows has nothing to enumerate. Without this check a
            // zero-width memory would report one element per row wrap-around.
            if (width <= 0 || height <= 0)
                return false;

            if (!_started)
            {
                _started = true;
                _currentX = 0;
                _currentY = 0;
                return true;
            }

            // Already past the last row: stay there.
            if ((long)_currentY >= height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: long2
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_long2: IDisposable, IEnumerable<long2>
	{
		IntPtr _intPtr;
		long2* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long2 and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_long2(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(long2));
            _sizeInBytes = _pitchInBytes * _height;

            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(long2)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
			_ptr = (long2*) _intPtr;
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long2 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_long2(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long2 with tightly packed rows and allocates the host memory.<para/>
        /// The pitch passed on is width * Marshal.SizeOf(typeof(long2)); allocation flags are set to 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_long2(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(long2)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_long2 with tightly packed rows and allocates the host memory
        /// using the given allocation flags.<para/>
        /// The pitch passed on is width * Marshal.SizeOf(typeof(long2)).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_long2(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(long2)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_long2()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory and suppresses finalization of this instance.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Dispose worker.<para/>
        /// When called with <c>true</c> the host memory is freed via cuMemFreeHost; the result code is only logged.<para/>
        /// When called with <c>false</c> (finalizer path) nothing is freed and only a debug warning is written.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Nothing left to do once the buffer has been released.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Gets the host pointer of the page locked buffer.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Gets the width in elements, as passed to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Gets the height in elements, as passed to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Gets the pitch (row stride) in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Gets the total allocation size in bytes (pitch in bytes * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Gets the size of one long2 element in bytes, as reported by Marshal.SizeOf.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Gets or sets a single element of the pinned host buffer.<para/>
        /// The row start is the buffer start plus y * Pitch bytes; x selects the element within that row.
        /// The indices are not range checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at column x of row y</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        public long2 this[SizeT x, SizeT y]
        {
            get
            {
                // Dispose releases the buffer; dereferencing _ptr afterwards would touch freed memory.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                long2* line = (long2*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                // Same guard as the getter: never write through a pointer to freed memory.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                long2* line = (long2*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy of this host buffer to a 2D CUDA array using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied for each of the Height rows; host rows are read with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of this host buffer to a 2D CUDA array.
        /// Forwards the array's CUarray handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray handle = array.CUArray;
            SynchronCopyToArray2D(handle);
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied for each of the Height rows; host rows are written with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer.
        /// Forwards the array's CUarray handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray handle = array.CUArray;
            SynchronCopyFromArray2D(handle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy of the whole host buffer (SizeInBytes bytes) to device memory using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of the whole host buffer (SizeInBytes bytes) to a device variable using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<long2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(target, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy from device memory into this host buffer (SizeInBytes bytes) using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy from a device variable into this host buffer (SizeInBytes bytes) using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<long2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy of this host buffer to pitched device memory using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied for each of the Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination in bytes</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy of this host buffer to a pitched device variable.
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<long2> deviceVar)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this host buffer using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied for each of the Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source in bytes</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a pitched device variable into this host buffer.
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<long2> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy of this host buffer to a 2D CUDA array, issued on the given stream via cuMemcpy2DAsync_v2.<para/>
        /// Width * TypeSize bytes are copied for each of the Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of this host buffer to a 2D CUDA array.
        /// Forwards the array's CUarray handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            AsyncCopyToArray2D(handle, stream);
        }

        /// <summary>
        /// Asynchronous copy of a 2D CUDA array into this host buffer, issued on the given stream via cuMemcpy2DAsync_v2.<para/>
        /// Width * TypeSize bytes are copied for each of the Height rows.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of a 2D CUDA array into this host buffer.
        /// Forwards the array's CUarray handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            AsyncCopyFromArray2D(handle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous linear copy of the whole host buffer (SizeInBytes bytes) to device memory,
        /// issued on the given stream via cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy from device memory into this host buffer (SizeInBytes bytes),
        /// issued on the given stream via cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of the whole host buffer (SizeInBytes bytes) to a device variable,
        /// issued on the given stream via cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<long2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(target, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy from a device variable into this host buffer (SizeInBytes bytes),
        /// issued on the given stream via cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<long2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, source, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy of this host buffer to pitched device memory, issued on the given stream via cuMemcpy2DAsync_v2.<para/>
        /// Width * TypeSize bytes are copied for each of the Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination in bytes</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy of this host buffer to a pitched device variable.
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<long2> deviceVar, CUstream stream)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(target, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this host buffer, issued on the given stream via cuMemcpy2DAsync_v2.<para/>
        /// Width * TypeSize bytes are copied for each of the Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source in bytes</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable into this host buffer.
        /// Forwards the variable's device pointer and pitch to the (CUdeviceptr, SizeT, CUstream) overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<long2> deviceVar, CUstream stream)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/><para/>
        /// Queried through cuMemHostGetDevicePointer_v2 with flags set to 0.
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mapped = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mapped, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mapped;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer.<para/>
        /// Queried through cuMemHostGetFlags.
        /// </summary>
        /// <returns>The allocation flags reported by the driver for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log under the name of the API that was actually called (was mislabelled as cuMemHostGetDevicePointer).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumerator over all elements of this buffer; refuses to enumerate a disposed buffer.
        IEnumerator<long2> IEnumerable<long2>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_long2(this);
        }

        // Non-generic enumerator over all elements of this buffer; refuses to enumerate a disposed buffer.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_long2(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_long2.<para/>
    /// Visits the elements row by row: x runs from 0 to Width - 1 within a row, then y advances.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_long2 : IEnumerator<long2>
    {
        private CudaPageLockedHostMemory2D_long2 _memory = null;
        // Position of the current element; x == -1 marks "before the first element".
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over the given memory, positioned before the first element.
        /// </summary>
        /// <param name="memory">The 2D host memory to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_long2(CudaPageLockedHostMemory2D_long2 memory)
        {
            _memory = memory;
        }

        // The enumerator owns no resources; the enumerated memory is left untouched.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Gets the element at the current position. Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        public long2 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Gets the element at the current position (boxed). Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at a valid element; false if there are no more elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A buffer without columns has no elements regardless of its height, and once the
            // end has been passed the position must not advance any further.
            if (width <= 0 || (long)_currentY >= height)
                return false;

            _currentX+=1;
            if ((long)_currentX >= width)
            {
                // End of row reached: wrap to the first column of the next row.
                _currentX = 0;
                _currentY+=1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: ulong
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_ulong: IDisposable, IEnumerable<ulong>
	{
		IntPtr _intPtr;
		ulong* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong and allocates pitchInBytes * height bytes on host using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * element size.</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/>.</exception>
        public CudaPageLockedHostMemory2D_ulong(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            // Record the requested geometry; the allocation spans one pitch per row.
            _intPtr = IntPtr.Zero;
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(ulong));
            _sizeInBytes = _pitchInBytes * _height;

            // A row of 'width' elements has to fit into a single pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(ulong)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));

            // Typed view onto the same buffer; the element indexer works through it.
            _ptr = (ulong*) _intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong with an explicit pitch and allocates the host memory.<para/>
        /// Forwards to the four-argument constructor with allocation flags set to 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ulong(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong with tightly packed rows and allocates the host memory.<para/>
        /// The pitch passed on is width * Marshal.SizeOf(typeof(ulong)); allocation flags are set to 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ulong(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ulong)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong with tightly packed rows and allocates the host memory
        /// using the given allocation flags.<para/>
        /// The pitch passed on is width * Marshal.SizeOf(typeof(ulong)).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_ulong(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ulong)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_ulong()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory and suppresses finalization of this instance.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Dispose worker.<para/>
        /// When called with <c>true</c> the host memory is freed via cuMemFreeHost; the result code is only logged.<para/>
        /// When called with <c>false</c> (finalizer path) nothing is freed and only a debug warning is written.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Nothing left to do once the buffer has been released.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Gets the host pointer of the page locked buffer.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Gets the width in elements, as passed to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Gets the height in elements, as passed to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Gets the pitch (row stride) in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Gets the total allocation size in bytes (pitch in bytes * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Gets the size of one ulong element in bytes, as reported by Marshal.SizeOf.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Gets or sets a single element of the pinned host buffer.<para/>
        /// The row start is the buffer start plus y * Pitch bytes; x selects the element within that row.
        /// The indices are not range checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at column x of row y</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        public ulong this[SizeT x, SizeT y]
        {
            get
            {
                // Dispose releases the buffer; dereferencing _ptr afterwards would touch freed memory.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                ulong* line = (ulong*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                // Same guard as the getter: never write through a pointer to freed memory.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                ulong* line = (ulong*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy of this host buffer to a 2D CUDA array using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied for each of the Height rows; host rows are read with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of this host buffer to a 2D CUDA array.
        /// Forwards the array's CUarray handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray handle = array.CUArray;
            SynchronCopyToArray2D(handle);
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied for each of the Height rows; host rows are written with this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a 2D CUDA array into this pinned host buffer. Forwards to the <see cref="CUarray"/> overload.
        /// </summary>
        /// <param name="array">Source array wrapper; its <c>CUArray</c> handle is used</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            // Unwrap the raw handle and let the CUarray overload do the work.
            CUarray handle = array.CUArray;
            SynchronCopyFromArray2D(handle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from this pinned host buffer to device memory with <c>cuMemcpyHtoD_v2</c>.<para/>
        /// This is a flat copy: no pitch information is passed to the driver.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, this.SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from this pinned host buffer to a device variable with <c>cuMemcpyHtoD_v2</c>.<para/>
        /// This is a flat copy: no pitch information is passed to the driver.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its <c>DevicePointer</c> is used</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<ulong> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(target, _intPtr, this.SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from device memory into this pinned host buffer with <c>cuMemcpyDtoH_v2</c>.<para/>
        /// This host buffer is the destination of the copy.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, this.SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from a device variable into this pinned host buffer with <c>cuMemcpyDtoH_v2</c>.<para/>
        /// This host buffer is the destination of the copy.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its <c>DevicePointer</c> is used</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<ulong> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, this.SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Copies this pinned host buffer to pitched device memory using the synchronous <c>cuMemcpy2D_v2</c> driver call.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed to the driver as <c>dstPitch</c></param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device may use different pitches; each side is described separately.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this pinned host buffer to a pitched device variable. Forwards to the (pointer, pitch) overload.
        /// </summary>
        /// <param name="deviceVar">Destination; its <c>DevicePointer</c> and <c>Pitch</c> are used</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<ulong> deviceVar)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Copies pitched device memory into this pinned host buffer using the synchronous <c>cuMemcpy2D_v2</c> driver call.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed to the driver as <c>srcPitch</c></param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device may use different pitches; each side is described separately.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a pitched device variable into this pinned host buffer. Forwards to the (pointer, pitch) overload.
        /// </summary>
        /// <param name="deviceVar">Source; its <c>DevicePointer</c> and <c>Pitch</c> are used</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<ulong> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Issues an asynchronous copy of this pinned host buffer into a 2D CUDA array via <c>cuMemcpy2DAsync_v2</c>.
        /// </summary>
        /// <param name="deviceArray">Handle of the destination array</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source is this host buffer (rows are _pitchInBytes apart); destination is the array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of this pinned host buffer into a 2D CUDA array. Forwards to the <see cref="CUarray"/> overload.
        /// </summary>
        /// <param name="array">Destination array wrapper; its <c>CUArray</c> handle is used</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            AsyncCopyToArray2D(handle, stream);
        }

        /// <summary>
        /// Issues an asynchronous copy of a 2D CUDA array into this pinned host buffer via <c>cuMemcpy2DAsync_v2</c>.
        /// </summary>
        /// <param name="deviceArray">Handle of the source array</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source is the array; destination is this host buffer (rows are _pitchInBytes apart).
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of a 2D CUDA array into this pinned host buffer. Forwards to the <see cref="CUarray"/> overload.
        /// </summary>
        /// <param name="array">Source array wrapper; its <c>CUArray</c> handle is used</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            AsyncCopyFromArray2D(handle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Issues an asynchronous flat copy of <see cref="SizeInBytes"/> bytes from this pinned host buffer to device memory via <c>cuMemcpyHtoDAsync_v2</c>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, this._intPtr, this.SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous flat copy of <see cref="SizeInBytes"/> bytes from device memory into this pinned host buffer via <c>cuMemcpyDtoHAsync_v2</c>.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, devicePtr, this.SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous flat copy of <see cref="SizeInBytes"/> bytes from this pinned host buffer to a device variable via <c>cuMemcpyHtoDAsync_v2</c>.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its <c>DevicePointer</c> is used</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<ulong> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(target, this._intPtr, this.SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous flat copy of <see cref="SizeInBytes"/> bytes from a device variable into this pinned host buffer via <c>cuMemcpyDtoHAsync_v2</c>.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its <c>DevicePointer</c> is used</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<ulong> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, source, this.SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Issues an asynchronous copy of this pinned host buffer to pitched device memory via <c>cuMemcpy2DAsync_v2</c>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed to the driver as <c>dstPitch</c></param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device may use different pitches; each side is described separately.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of this pinned host buffer to a pitched device variable. Forwards to the (pointer, pitch, stream) overload.
        /// </summary>
        /// <param name="deviceVar">Destination; its <c>DevicePointer</c> and <c>Pitch</c> are used</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<ulong> deviceVar, CUstream stream)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(target, targetPitch, stream);
        }

        /// <summary>
        /// Issues an asynchronous copy of pitched device memory into this pinned host buffer via <c>cuMemcpy2DAsync_v2</c>.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed to the driver as <c>srcPitch</c></param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device may use different pitches; each side is described separately.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of a pitched device variable into this pinned host buffer. Forwards to the (pointer, pitch, stream) overload.
        /// </summary>
        /// <param name="deviceVar">Source; its <c>DevicePointer</c> and <c>Pitch</c> are used</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<ulong> deviceVar, CUstream stream)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space (queried with <c>cuMemHostGetDevicePointer_v2</c>).<para/>
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mapped = new CUdeviceptr();
            // The last argument (flags) is passed as 0.
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mapped, _intPtr, 0);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mapped;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (queried with <c>cuMemHostGetFlags</c>).
        /// </summary>
        /// <returns>The allocation flags reported by the driver</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the API that was actually called (the label used to name cuMemHostGetDevicePointer).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumerator over all elements; refuses to enumerate a disposed buffer.
        IEnumerator<ulong> IEnumerable<ulong>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ulong(this);
        }

        // Non-generic enumerator over all elements; refuses to enumerate a disposed buffer.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ulong(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_ulong.<para/>
    /// Visits the elements row by row: x runs over [0, Width) and y is advanced each time a row is exhausted.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_ulong : IEnumerator<ulong>
    {
        private CudaPageLockedHostMemory2D_ulong _memory = null;
        // (-1, 0) is the "before the first element" position.
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">The pinned host memory to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_ulong(CudaPageLockedHostMemory2D_ulong memory)
        {
            _memory = memory;
        }

        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// The element at the current position.
        /// </summary>
        /// <exception cref="InvalidOperationException">The enumerator is positioned before the first or after the last element</exception>
        public ulong Current
        {
            get
            {
                ThrowIfNotOnElement();
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// The element at the current position, boxed.
        /// </summary>
        /// <exception cref="InvalidOperationException">The enumerator is positioned before the first or after the last element</exception>
        object IEnumerator.Current
        {
            get
            {
                ThrowIfNotOnElement();
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false if there are no (more) elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // Nothing to visit for an empty dimension, and once finished the position is left untouched.
            if (width <= 0 || (long)_currentY >= height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                // Row exhausted: wrap to the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }

        // The memory indexer does unchecked pointer arithmetic, so an index outside
        // [0, Width) x [0, Height) must never be forwarded to it.
        private void ThrowIfNotOnElement()
        {
            long x = (long)_currentX;
            long y = (long)_currentY;
            if (x < 0 || x >= (long)_memory.Width || y < 0 || y >= (long)_memory.Height)
                throw new InvalidOperationException("Enumeration has either not started or has already finished.");
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: ulong1
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_ulong1: IDisposable, IEnumerable<ulong1>
	{
		IntPtr _intPtr;
		ulong1* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong1 and allocates <c>pitchInBytes * height</c> bytes on the host with cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes; must be at least <c>width * sizeof(ulong1)</c></param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed through to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException"><paramref name="pitchInBytes"/> is too small to hold one row</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_ulong1(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            this._intPtr = new IntPtr();
            this._width = width;
            this._pitchInBytes = pitchInBytes;
            this._height = height;
            this._typeSize = (SizeT)Marshal.SizeOf(typeof(ulong1));
            this._sizeInBytes = this._pitchInBytes * this._height;

            // A row of 'width' elements has to fit inside one pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(ulong1)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            // Typed view of the same allocation, used by the indexer.
            _ptr = (ulong1*)_intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong1 with an explicit pitch and allocates the memory on host.<para/>
        /// Delegates to the four-argument constructor with allocation flags 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ulong1(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong1 and allocates the memory on host.<para/>
        /// Pitch is set to width * sizeof(ulong1) and the allocation flags are 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ulong1(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ulong1)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong1 and allocates the memory on host with the given flags.<para/>
        /// Pitch is set to width * sizeof(ulong1).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed through to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_ulong1(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ulong1)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_ulong1()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory and suppresses finalization of this instance.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            // Cleanup has been done explicitly, so the finalizer has nothing left to do.
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable. On an explicit dispose the host memory is freed with cuMemFreeHost (the result is only traced, never thrown).<para/>
        /// When called from the finalizer on an instance that was never disposed, only a debug warning is written.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Nothing to do (and nothing to warn about) once the memory has been released.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(string.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Raw pointer to the pinned host allocation.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Number of elements per row, as given to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows, as given to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Distance between the starts of two consecutive rows, in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one ulong1 element in bytes, as reported by Marshal.SizeOf.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element. The row is located at <c>y * Pitch</c> bytes from the start of the buffer.<para/>
        /// No range or disposed check is performed on this path.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element stored at (x, y)</returns>
        public ulong1 this[SizeT x, SizeT y]
        {
            get
            {
                // Step to the row in bytes first, then index within the row in elements.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((ulong1*)rowStart)[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((ulong1*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Copies this pinned host buffer into a 2D CUDA array using the synchronous <c>cuMemcpy2D_v2</c> driver call.
        /// </summary>
        /// <param name="deviceArray">Handle of the destination array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source is this host buffer (rows are _pitchInBytes apart); destination is the array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this pinned host buffer into a 2D CUDA array. Forwards to the <see cref="CUarray"/> overload.
        /// </summary>
        /// <param name="array">Destination array wrapper; its <c>CUArray</c> handle is used</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            // Unwrap the raw handle and let the CUarray overload do the work.
            CUarray handle = array.CUArray;
            SynchronCopyToArray2D(handle);
        }

        /// <summary>
        /// Copies a 2D CUDA array into this pinned host buffer using the synchronous <c>cuMemcpy2D_v2</c> driver call.
        /// </summary>
        /// <param name="deviceArray">Handle of the source array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source is the array; destination is this host buffer (rows are _pitchInBytes apart).
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a 2D CUDA array into this pinned host buffer. Forwards to the <see cref="CUarray"/> overload.
        /// </summary>
        /// <param name="array">Source array wrapper; its <c>CUArray</c> handle is used</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            // Unwrap the raw handle and let the CUarray overload do the work.
            CUarray handle = array.CUArray;
            SynchronCopyFromArray2D(handle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes (the whole allocation, pitch * height) from this pinned host buffer to device memory with <c>cuMemcpyHtoD_v2</c>.<para/>
        /// This is a flat copy: no pitch information is passed to the driver.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, this.SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes (the whole allocation, pitch * height) from this pinned host buffer to a device variable with <c>cuMemcpyHtoD_v2</c>.<para/>
        /// This is a flat copy: no pitch information is passed to the driver.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its <c>DevicePointer</c> is used</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<ulong1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(target, _intPtr, this.SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from device memory into this pinned host buffer with <c>cuMemcpyDtoH_v2</c>.<para/>
        /// This host buffer is the destination of the copy.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, this.SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from a device variable into this pinned host buffer with <c>cuMemcpyDtoH_v2</c>.<para/>
        /// This host buffer is the destination of the copy.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its <c>DevicePointer</c> is used</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<ulong1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, this.SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Copies this pinned host buffer to pitched device memory using the synchronous <c>cuMemcpy2D_v2</c> driver call.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed to the driver as <c>dstPitch</c></param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device may use different pitches; each side is described separately.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this pinned host buffer to a pitched device variable. Forwards to the (pointer, pitch) overload.
        /// </summary>
        /// <param name="deviceVar">Destination; its <c>DevicePointer</c> and <c>Pitch</c> are used</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<ulong1> deviceVar)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Copies pitched device memory into this pinned host buffer using the synchronous <c>cuMemcpy2D_v2</c> driver call.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed to the driver as <c>srcPitch</c></param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host and device may use different pitches; each side is described separately.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<ulong1> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to a CUDA array, issued through cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // No destination pitch is set for the array side; the host side uses this buffer's pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from this pinned host buffer to a 2D array.<para/>
        /// Forwards the array's CUarray handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Destination 2D array</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            AsyncCopyToArray2D(handle, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from a CUDA array into this pinned host buffer, issued through cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // No source pitch is set for the array side; the host side uses this buffer's pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a 2D array into this pinned host buffer.<para/>
        /// Forwards the array's CUarray handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Source 2D array</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            AsyncCopyFromArray2D(handle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous linear copy from this pinned host buffer to device memory, issued through cuMemcpyHtoDAsync_v2
        /// with <see cref="SizeInBytes"/> as the copy size.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous linear copy from device memory into this pinned host buffer, issued through cuMemcpyDtoHAsync_v2
        /// with <see cref="SizeInBytes"/> as the copy size.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous linear copy from this pinned host buffer to a device variable, issued through cuMemcpyHtoDAsync_v2
        /// with <see cref="SizeInBytes"/> as the copy size.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<ulong1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(target, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous linear copy from a device variable into this pinned host buffer, issued through cuMemcpyDtoHAsync_v2
        /// with <see cref="SizeInBytes"/> as the copy size.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<ulong1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, source, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to pitched device memory, issued through cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Host side is the source and keeps its own pitch; every row transfers _width * _typeSize bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from this pinned host buffer to a pitched device variable.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch/stream overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<ulong1> deviceVar, CUstream stream)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(target, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this pinned host buffer, issued through cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Host side is the destination and keeps its own pitch; every row transfers _width * _typeSize bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch/stream overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<ulong1> deviceVar, CUstream stream)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries cuMemHostGetDevicePointer_v2 for the device pointer that corresponds to this pinned host buffer.
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr mapped = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mapped, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res == CUResult.Success)
            {
                return mapped;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer, as reported by cuMemHostGetFlags.
        /// </summary>
        /// <returns>Allocation flags of this buffer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // The trace must name the API actually called; it previously reported "cuMemHostGetDevicePointer".
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_ulong1 bound to this instance</returns>
        IEnumerator<ulong1> IEnumerable<ulong1>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ulong1(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_ulong1 bound to this instance</returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ulong1(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_ulong1.<para/>
    /// Walks the buffer through its [x, y] indexer: x is advanced first and, once it reaches Width,
    /// is reset to 0 while y moves on to the next row.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_ulong1 : IEnumerator<ulong1>
    {
        // Buffer being enumerated; only read through Width, Height and the indexer.
        private CudaPageLockedHostMemory2D_ulong1 _memory;
        // Position of the current element; x starts at -1 so that the first MoveNext() advances it.
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_ulong1(CudaPageLockedHostMemory2D_ulong1 memory)
        {
            _memory = memory;
        }

        // Nothing to release: the enumerator does not own the buffer.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Puts the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentY = 0;
            _currentX = -1;
        }

        /// <summary>
        /// Element at the current (x, y) position, read through the buffer's indexer.
        /// </summary>
        public ulong1 Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Same element as <see cref="Current"/>, boxed as object.
        /// </summary>
        object IEnumerator.Current
        {
            get
            {
                return Current;
            }
        }

        /// <summary>
        /// Advances to the next element, wrapping to the start of the next row when x reaches Width.
        /// </summary>
        /// <returns>true while the row index is still below Height, false otherwise</returns>
        public bool MoveNext()
        {
            _currentX += 1;

            bool pastEndOfRow = (long)_currentX >= (long)_memory.Width;
            if (pastEndOfRow)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: ulong2
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_ulong2: IDisposable, IEnumerable<ulong2>
	{
		// Address of the pinned host allocation; filled in by cuMemHostAlloc in the constructor.
		IntPtr _intPtr;
		// Same address as _intPtr, typed as ulong2* for element access through the indexer.
		ulong2* _ptr;
        // Size of the allocation in bytes; set to _pitchInBytes * _height in the constructor.
        SizeT _sizeInBytes = 0;
        // Width in elements.
        SizeT _width = 0;
        // Distance between the starts of consecutive rows, in bytes.
        SizeT _pitchInBytes = 0;
        // Height in elements.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(ulong2)), cached in the constructor.
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made by this instance.
        CUResult res;
        // Set to true once Dispose(true) has run; the public copy methods and enumerator factories check it before use.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates pitchInBytes * height bytes on the host using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(ulong2)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_ulong2(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = IntPtr.Zero;
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(ulong2));
            _sizeInBytes = _pitchInBytes * _height;

            // A row of 'width' elements has to fit into one pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(ulong2)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));

            // The typed pointer is assigned before the result check, whatever the outcome of the allocation.
            _ptr = (ulong2*)_intPtr.ToPointer();
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates the memory on host using cuMemHostAlloc with no flags set (0).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ulong2(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates the memory on host using cuMemHostAlloc with no flags set (0).<para/>
        /// Pitch is assumed to be width * sizeof(ulong2).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_ulong2(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ulong2)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_ulong2 and allocates the memory on host using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(ulong2).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_ulong2(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(ulong2)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Calls Dispose(false), which only writes a debug warning if the object was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_ulong2()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory via Dispose(true) and removes this object from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable.<para/>
        /// When called with true on a not yet disposed object, frees the host memory with cuMemFreeHost and marks the object as disposed;
        /// the result of the free is only written to the debug output, no exception is thrown.<para/>
        /// When called with false on a not yet disposed object, only a not-disposed warning is written to the debug output.
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Address of the pinned host allocation, as returned by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width in elements, as passed to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Height in elements, as passed to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Pitch in bytes (row width including alignment).
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one ulong2 element in bytes, as reported by Marshal.SizeOf.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element.<para/>
        /// The start of row <paramref name="y"/> is the base pointer advanced by <c>_pitchInBytes * y</c> bytes;
        /// element <paramref name="x"/> of that row is then read or written.<para/>
        /// Neither the indices nor the disposed state are checked here.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element stored at column <paramref name="x"/> of row <paramref name="y"/></returns>
        public ulong2 this[SizeT x, SizeT y]
        {
            get
            {
				// Rows are _pitchInBytes apart, so the row offset is applied on a byte pointer before reinterpreting as ulong2*.
				ulong2* line = (ulong2*)(((byte*)_ptr) + _pitchInBytes * y);
				return line[x];
            }
            set
            {
				// Same row addressing as the getter.
				ulong2* line = (ulong2*)(((byte*)_ptr) + _pitchInBytes * y);
				line[x] = value;                
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer to a CUDA array, issued through cuMemcpy2D_v2.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // No destination pitch is set for the array side; the host side uses this buffer's pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from this pinned host buffer to a 2D array.<para/>
        /// Forwards the array's CUarray handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Destination 2D array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray handle = array.CUArray;
            SynchronCopyToArray2D(handle);
        }

        /// <summary>
        /// Synchronous 2D copy from a CUDA array into this pinned host buffer, issued through cuMemcpy2D_v2.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // No source pitch is set for the array side; the host side uses this buffer's pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from a 2D array into this pinned host buffer.<para/>
        /// Forwards the array's CUarray handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Source 2D array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray handle = array.CUArray;
            SynchronCopyFromArray2D(handle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy from this pinned host buffer to device memory, issued through cuMemcpyHtoD_v2.<para/>
        /// The copy size is the whole allocation (pitch * height bytes).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous linear copy from this pinned host buffer to a device variable, issued through cuMemcpyHtoD_v2.<para/>
        /// The copy size is the whole allocation (pitch * height bytes).
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<ulong2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(target, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous linear copy from device memory into this pinned host buffer, issued through cuMemcpyDtoH_v2.<para/>
        /// The copy size is the whole allocation (pitch * height bytes).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous linear copy from a device variable into this pinned host buffer, issued through cuMemcpyDtoH_v2.<para/>
        /// The copy size is the whole allocation (pitch * height bytes).
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<ulong2> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy from this pinned host buffer to pitched device memory, issued through cuMemcpy2D_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Host side is the source and keeps its own pitch; every row transfers _width * _typeSize bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from this pinned host buffer to a pitched device variable.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<ulong2> deviceVar)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this pinned host buffer, issued through cuMemcpy2D_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Host side is the destination and keeps its own pitch; every row transfers _width * _typeSize bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy from a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<ulong2> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous 2D copy from this pinned host buffer to a CUDA array, issued through cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // No destination pitch is set for the array side; the host side uses this buffer's pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from this pinned host buffer to a 2D array.<para/>
        /// Forwards the array's CUarray handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Destination 2D array</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            AsyncCopyToArray2D(handle, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from a CUDA array into this pinned host buffer, issued through cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // No source pitch is set for the array side; the host side uses this buffer's pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a 2D array into this pinned host buffer.<para/>
        /// Forwards the array's CUarray handle to the handle-based overload.
        /// </summary>
        /// <param name="array">Source 2D array</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            AsyncCopyFromArray2D(handle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous linear copy from this pinned host buffer to device memory, issued through cuMemcpyHtoDAsync_v2.<para/>
        /// The copy size is the whole allocation (pitch * height bytes).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous linear copy from device memory into this pinned host buffer, issued through cuMemcpyDtoHAsync_v2.<para/>
        /// The copy size is the whole allocation (pitch * height bytes).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }
            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to device: passes this buffer's host pointer and
        /// <c>SizeInBytes</c> to cuMemcpyHtoDAsync_v2, targeting the device variable's pointer.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its <c>DevicePointer</c> is used</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call returned something other than Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<ulong2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr.DevicePointer,
                _intPtr,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy device to host: passes this buffer's host pointer and
        /// <c>SizeInBytes</c> to cuMemcpyDtoHAsync_v2, reading from the device variable's pointer.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its <c>DevicePointer</c> is used</param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call returned something other than Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<ulong2> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr,
                devicePtr.DevicePointer,
                SizeInBytes,
                stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy from this host buffer to pitched device memory via cuMemcpy2DAsync_v2.
        /// Copies <c>_width * _typeSize</c> bytes per row for <c>_height</c> rows, using this buffer's
        /// pitch on the host side and <paramref name="pitchDevice"/> on the device side.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, assigned to <c>dstPitch</c></param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call returned something other than Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer.
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,

                // Destination: caller-supplied pitched device memory.
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,

                // Extent of the copied region.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to pitched device. Convenience overload that takes pointer and
        /// pitch from the device variable and forwards to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable; its <c>DevicePointer</c> and <c>Pitch</c> are used</param>
        /// <param name="stream">Stream handed on to the forwarded call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<ulong2> deviceVar, CUstream stream)
        {
            var targetPointer = deviceVar.DevicePointer;
            var targetPitch = deviceVar.Pitch;
            this.AsyncCopyToDevice(targetPointer, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this host buffer via cuMemcpy2DAsync_v2.
        /// Copies <c>_width * _typeSize</c> bytes per row for <c>_height</c> rows, using
        /// <paramref name="pitchDevice"/> on the device side and this buffer's pitch on the host side.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, assigned to <c>srcPitch</c></param>
        /// <param name="stream">Stream passed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call returned something other than Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: caller-supplied pitched device memory.
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,

                // Destination: this pinned host buffer.
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,

                // Extent of the copied region.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host. Convenience overload that takes pointer and
        /// pitch from the device variable and forwards to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable; its <c>DevicePointer</c> and <c>Pitch</c> are used</param>
        /// <param name="stream">Stream handed on to the forwarded call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<ulong2> deviceVar, CUstream stream)
        {
            var sourcePointer = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;
            this.AsyncCopyFromDevice(sourcePointer, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer filled in by cuMemHostGetDevicePointer_v2</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call returned something other than Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res == CUResult.Success)
            {
                return mappedPointer;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// (queried through cuMemHostGetFlags).
        /// </summary>
        /// <returns>Flags reported by the driver for this buffer's host pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call returned something other than Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // The trace must name the function that was actually called; it previously
            // reported "cuMemHostGetDevicePointer", which made debug logs misleading.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over this buffer.
        /// </summary>
        /// <returns>A new <see cref="CudaPageLockedHostMemory2DEnumerator_ulong2"/> bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator<ulong2> IEnumerable<ulong2>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ulong2(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over this buffer.
        /// </summary>
        /// <returns>A new <see cref="CudaPageLockedHostMemory2DEnumerator_ulong2"/> bound to this instance</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_ulong2(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_ulong2.<para/>
    /// Visits the elements row by row: x advances from 0 to Width-1, then y is incremented and x restarts at 0.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_ulong2 : IEnumerator<ulong2>
    {
        private CudaPageLockedHostMemory2D_ulong2 _memory = null;
        // Position of the current element; x starts one step before the first column.
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate; only a reference is stored</param>
        public CudaPageLockedHostMemory2DEnumerator_ulong2(CudaPageLockedHostMemory2D_ulong2 memory)
        {
            _memory = memory;
        }

        /// <summary>
        /// Nothing to release: the enumerator only holds a reference to the buffer.
        /// </summary>
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current (x, y) position, read through the buffer's indexer.
        /// Only call this after <see cref="MoveNext"/> has returned true.
        /// </summary>
        public ulong2 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Boxed element at the current (x, y) position.
        /// Only call this after <see cref="MoveNext"/> has returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the new position is inside the buffer; false once all rows are exhausted or the buffer has no elements</returns>
        public bool MoveNext()
        {
            // A buffer with no columns (or no rows) contains no elements. Without this guard a
            // zero-width buffer would wrap x to 0 on every call and report a valid position
            // for each remaining row, letting Current index a column that does not exist.
            if ((long)_memory.Width <= 0 || (long)_memory.Height <= 0)
                return false;

            _currentX+=1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: continue at the first column of the next row.
                _currentX = 0;
                _currentY+=1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy.<para/>
	/// Type: float
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_float: IDisposable, IEnumerable<float>
	{
		IntPtr _intPtr;
		float* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_float(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(float));
            _sizeInBytes = _pitchInBytes * _height;

            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(float)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
			_ptr = (float*) _intPtr;
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_float(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(float). Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_float(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(float)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(float). Using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_float(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(float)), height, allocFlags)
        {

        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_float()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing"></param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns></returns>
        public float this[SizeT x, SizeT y]
        {
            get
            {
				float* line = (float*)(((byte*)_ptr) + _pitchInBytes * y);
				return line[x];
            }
            set
            {
				float* line = (float*)(((byte*)_ptr) + _pitchInBytes * y);
				line[x] = value;                
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CudaDeviceVariable<float> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CudaDeviceVariable<float> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<float> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<float> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaDeviceVariable<float> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<float> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<float> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<float> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer under which this pinned host memory is mapped into device memory space
        /// (cuMemHostGetDevicePointer). Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res == CUResult.Success)
            {
                return mappedPointer;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>Allocation flags reported by the driver for this buffer</returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // The trace must name the driver call that was actually made; it previously
            // reported "cuMemHostGetDevicePointer", which made debug logs misleading.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_float bound to this instance</returns>
        IEnumerator<float> IEnumerable<float>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_float(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new CudaPageLockedHostMemory2DEnumerator_float bound to this instance</returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_float(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_float.<para/>
    /// Elements are visited row by row: x runs from 0 to Width - 1, then y advances to the next row.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_float : IEnumerator<float>
    {
        private CudaPageLockedHostMemory2D_float _memory = null;
        // Position of the current element; x starts at -1, i.e. "before the first element".
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over the given buffer.
        /// </summary>
        /// <param name="memory">Buffer to enumerate; must not be null</param>
        /// <exception cref="ArgumentNullException">memory is null</exception>
        public CudaPageLockedHostMemory2DEnumerator_float(CudaPageLockedHostMemory2D_float memory)
        {
            // Fail here rather than with a NullReferenceException on the first MoveNext/Current.
            if (memory == null) throw new ArgumentNullException("memory");
            _memory = memory;
        }

        // The enumerator holds no resources of its own; the buffer is owned by the caller.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position. Only meaningful after MoveNext returned true.
        /// </summary>
        public float Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed). Only meaningful after MoveNext returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false if the end was passed or the buffer is empty</returns>
        public bool MoveNext()
        {
            // A buffer with no columns or no rows contains no elements. Without this guard a
            // zero-width buffer with more than one row reported elements that do not exist.
            if ((long)_memory.Width <= 0 || (long)_memory.Height <= 0)
                return false;

            _currentX += 1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: continue at the start of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: float1
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_float1: IDisposable, IEnumerable<float1>
	{
		// Raw pointer returned by cuMemHostAlloc, and the same address typed as float1*.
		IntPtr _intPtr;
		float1* _ptr;
        // Total allocation size: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Row length in elements.
        SizeT _width = 0;
        // Row length in bytes including alignment padding.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(float1).
        SizeT _typeSize = 0;
        // Result of the most recent driver call.
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float1 and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(float1)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return Success</exception>
        public CudaPageLockedHostMemory2D_float1(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(float1));
            _sizeInBytes = _pitchInBytes * _height;

            // One row of 'width' elements must fit into the pitch.
            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(float1)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
			_ptr = (float1*) _intPtr;
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float1 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_float1(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float1 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(float1).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_float1(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(float1)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float1 and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(float1).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_float1(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(float1)), height, allocFlags)
        {

        }

        /// <summary>
        /// Finalizer. Calls Dispose(false), which frees nothing and only writes a debug warning
        /// if the instance was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_float1()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory (cuMemFreeHost) and suppresses finalization.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                // The result of cuMemFreeHost is only traced, never thrown.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            // Finalizer path: the memory is NOT freed here, only a warning is traced.
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes (pitch * height)
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element. The indices are not range checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>Element at column x of row y</returns>
        /// <exception cref="ObjectDisposedException">The buffer has already been freed</exception>
        public float1 this[SizeT x, SizeT y]
        {
            get
            {
                // After Dispose() the pointer refers to freed memory; refuse to dereference it.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                // Rows are _pitchInBytes apart, so the row start is computed in bytes.
				float1* line = (float1*)(((byte*)_ptr) + _pitchInBytes * y);
				return line[x];
            }
            set
            {
                // After Dispose() the pointer refers to freed memory; refuse to write through it.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                // Rows are _pitchInBytes apart, so the row start is computed in bytes.
				float1* line = (float1*)(((byte*)_ptr) + _pitchInBytes * y);
				line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous copy host to device. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to device. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        public void SynchronCopyToDevice(CudaDeviceVariable<float1> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy device to host. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy device to host. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        public void SynchronCopyToHost(CudaDeviceVariable<float1> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous copy host to pitched device
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory</param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<float1> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous copy pitched device to host
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory</param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<float1> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy host to device. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy device to host. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to device. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaDeviceVariable<float1> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy device to host. Copies SizeInBytes bytes as one linear block.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<float1> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous copy host to pitched device
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<float1> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<float1> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>Allocation flags reported by the driver for this buffer</returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // The trace must name the driver call that was actually made; it previously
            // reported "cuMemHostGetDevicePointer", which made debug logs misleading.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        IEnumerator<float1> IEnumerable<float1>.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator<float1> enumerator = new CudaPageLockedHostMemory2DEnumerator_float1(this);
            return enumerator;
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator enumerator = new CudaPageLockedHostMemory2DEnumerator_float1(this);
            return enumerator;
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_float1.<para/>
    /// Elements are visited row by row: x runs from 0 to Width - 1, then y advances to the next row.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_float1 : IEnumerator<float1>
    {
        private CudaPageLockedHostMemory2D_float1 _memory = null;
        // Position of the current element; x starts at -1, i.e. "before the first element".
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over the given buffer.
        /// </summary>
        /// <param name="memory">Buffer to enumerate; must not be null</param>
        /// <exception cref="ArgumentNullException">memory is null</exception>
        public CudaPageLockedHostMemory2DEnumerator_float1(CudaPageLockedHostMemory2D_float1 memory)
        {
            // Fail here rather than with a NullReferenceException on the first MoveNext/Current.
            if (memory == null) throw new ArgumentNullException("memory");
            _memory = memory;
        }

        // The enumerator holds no resources of its own; the buffer is owned by the caller.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position. Only meaningful after MoveNext returned true.
        /// </summary>
        public float1 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed). Only meaningful after MoveNext returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false if the end was passed or the buffer is empty</returns>
        public bool MoveNext()
        {
            // A buffer with no columns or no rows contains no elements. Without this guard a
            // zero-width buffer with more than one row reported elements that do not exist.
            if ((long)_memory.Width <= 0 || (long)_memory.Height <= 0)
                return false;

            _currentX += 1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of row reached: continue at the start of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variabe for asynchronous memcpy.<para/>
	/// Type: float2
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_float2: IDisposable, IEnumerable<float2>
	{
		IntPtr _intPtr;
		float2* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Allocates a new page-locked 2D host buffer of float2 elements with cuMemHostAlloc.
        /// The allocation is pitchInBytes * height bytes large.
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="pitchInBytes">Row length in bytes including alignment; must be at least width * sizeof(float2)</param>
        /// <param name="height">Number of rows</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_float2(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = IntPtr.Zero;
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(float2));
            _sizeInBytes = _pitchInBytes * _height;

            // One row of 'width' elements has to fit into the pitch.
            SizeT rowBytes = _typeSize * width;
            if (rowBytes > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(float2)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));

            // Typed view of the same address, stored before the result is checked.
            _ptr = (float2*)_intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float2 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_float2(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float2 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(float2). Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_float2(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(float2)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float2 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(float2). Using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_float2(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(float2)), height, allocFlags)
        {

        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_float2()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing"></param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns></returns>
        public float2 this[SizeT x, SizeT y]
        {
            get
            {
				float2* line = (float2*)(((byte*)_ptr) + _pitchInBytes * y);
				return line[x];
            }
            set
            {
				float2* line = (float2*)(((byte*)_ptr) + _pitchInBytes * y);
				line[x] = value;                
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CudaDeviceVariable<float2> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CudaDeviceVariable<float2> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<float2> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<float2> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaDeviceVariable<float2> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<float2> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<float2> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<float2> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns></returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        IEnumerator<float2> IEnumerable<float2>.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator<float2> enumerator = new CudaPageLockedHostMemory2DEnumerator_float2(this);
            return enumerator;
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator enumerator = new CudaPageLockedHostMemory2DEnumerator_float2(this);
            return enumerator;
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_float2
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_float2 : IEnumerator<float2>
    {
        // Buffer being enumerated and the (x, y) position of the current element.
        // x starts at -1 so that the first MoveNext() lands on element (0, 0).
        private CudaPageLockedHostMemory2D_float2 _memory = null;
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator that walks <paramref name="memory"/> row by row.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_float2(CudaPageLockedHostMemory2D_float2 memory)
        {
            this._memory = memory;
        }

        // The enumerator holds no resources of its own.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Puts the enumerator back in front of the first element.
        /// </summary>
        public void Reset()
        {
            _currentY = 0;
            _currentX = -1;
        }

        /// <summary>
        /// Element at the current (x, y) position, read through the buffer's indexer.
        /// </summary>
        public float2 Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Boxed element at the current (x, y) position.
        /// </summary>
        object IEnumerator.Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Advances one element to the right, wrapping to the next row at the end of a row.
        /// </summary>
        /// <returns>false once the row index has reached the buffer height, true otherwise</returns>
        public bool MoveNext()
        {
            _currentX += 1;
            bool pastEndOfRow = (long)_currentX >= (long)_memory.Width;
            if (pastEndOfRow)
            {
                _currentY += 1;
                _currentX = 0;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page-locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: float3
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_float3: IDisposable, IEnumerable<float3>
	{
		IntPtr _intPtr;
		float3* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float3 and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_float3(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            // Record the geometry; the allocation size is pitch (bytes per row) times the number of rows.
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(float3));
            _sizeInBytes = _pitchInBytes * _height;

            // A row of `width` elements must fit into one pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                // Nothing was allocated. The finalizer still runs for an object whose
                // constructor threw, so mark it disposed to avoid a spurious
                // "not-disposed" warning.
                disposed = true;
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(float3)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            // Typed alias of the allocation, used by the element indexer.
            _ptr = (float3*) _intPtr;
            if (res != CUResult.Success)
            {
                // Allocation failed, so there is nothing to free later (see above).
                disposed = true;
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float3 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_float3(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
            // All work is done by the four-argument constructor; 0 means "no allocation flags".
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float3 and allocates the memory on host using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(float3).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_float3(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(float3)), height, (CUMemHostAllocFlags)0)
        {
            // Rows are tightly packed: pitch = width * element size. 0 means "no allocation flags".
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float3 and allocates the memory on host using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(float3).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_float3(
            SizeT width,
            SizeT height,
            CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(float3)), height, allocFlags)
        {
            // Rows are tightly packed: pitch = width * element size.
        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_float3()
        {
            // Finalizer path: Dispose(false) only emits the not-disposed debug warning.
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            // Explicit disposal: release the host memory, then tell the GC that
            // the finalizer no longer needs to run for this instance.
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing"></param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Already released: nothing to free and nothing to warn about.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // Called from Dispose(): free the pinned host memory. The result is only logged.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                // Called from the finalizer: the memory is not freed here, only a warning is written.
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                // Address written by cuMemHostAlloc in the constructor.
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get
            {
                // Number of elements per row, as passed to the constructor.
                return this._width;
            }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get
            {
                // Number of rows, as passed to the constructor.
                return this._height;
            }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                // Bytes per row including alignment padding.
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                // Total allocation size: pitch * height, computed in the constructor.
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                // Marshal.SizeOf(typeof(float3)), computed in the constructor.
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns></returns>
        public float3 this[SizeT x, SizeT y]
        {
            // Rows are _pitchInBytes apart, so the start of row y is found with byte
            // arithmetic before indexing by element. No bounds check is performed.
            get
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((float3*)rowStart)[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((float3*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D();

            // Source: this pinned host buffer, rows _pitchInBytes apart.
            copyParams.srcMemoryType = CUMemoryType.Host;
            copyParams.srcHost = _intPtr;
            copyParams.srcPitch = _pitchInBytes;

            // Destination: the given CUDA array.
            copyParams.dstMemoryType = CUMemoryType.Array;
            copyParams.dstArray = deviceArray;

            // Extent: payload bytes per row (without padding) and number of rows.
            copyParams.WidthInBytes = _width * _typeSize;
            copyParams.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this pinned host buffer into a 2D array wrapper.<para/>
        /// Forwards to the <see cref="CUarray"/> overload using the wrapper's array handle.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray handle = array.CUArray;
            this.SynchronCopyToArray2D(handle);
        }

        /// <summary>
        /// Synchronously copies a CUDA 2D array into this pinned host buffer via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host destination is described by pointer + pitch; each row moves width * element size bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a 2D array wrapper into this pinned host buffer.<para/>
        /// Forwards to the <see cref="CUarray"/> overload using the wrapper's array handle.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray handle = array.CUArray;
            this.SynchronCopyFromArray2D(handle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from this pinned host buffer to device memory
        /// as one linear block via cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from this pinned host buffer to a device variable
        /// as one linear block via cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its device pointer is the copy target</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<float3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from device memory into this pinned host buffer
        /// as one linear block via cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from a device variable into this pinned host buffer
        /// as one linear block via cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its device pointer is the copy source</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<float3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this pinned host buffer to pitched device memory via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Row pitch of the destination, passed to the driver as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source and destination carry independent pitches; each row moves width * element size bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this pinned host buffer to a pitched device variable.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<float3> deviceVar)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            this.SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Synchronously copies pitched device memory into this pinned host buffer via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Row pitch of the source, passed to the driver as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source and destination carry independent pitches; each row moves width * element size bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<float3> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            this.SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Issues a copy of this pinned host buffer into a CUDA 2D array via cuMemcpy2DAsync_v2 on the given stream.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host source is described by pointer + pitch; each row moves width * element size bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of this pinned host buffer into a 2D array wrapper.<para/>
        /// Forwards to the <see cref="CUarray"/> overload using the wrapper's array handle.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            this.AsyncCopyToArray2D(handle, stream);
        }

        /// <summary>
        /// Issues a copy of a CUDA 2D array into this pinned host buffer via cuMemcpy2DAsync_v2 on the given stream.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host destination is described by pointer + pitch; each row moves width * element size bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of a 2D array wrapper into this pinned host buffer.<para/>
        /// Forwards to the <see cref="CUarray"/> overload using the wrapper's array handle.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            this.AsyncCopyFromArray2D(handle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Issues a linear copy of <see cref="SizeInBytes"/> bytes from this pinned host buffer to device memory
        /// via cuMemcpyHtoDAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, this._intPtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues a linear copy of <see cref="SizeInBytes"/> bytes from device memory into this pinned host buffer
        /// via cuMemcpyDtoHAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, devicePtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues a linear copy of <see cref="SizeInBytes"/> bytes from this pinned host buffer to a device variable
        /// via cuMemcpyHtoDAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its device pointer is the copy target</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<float3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, this._intPtr, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues a linear copy of <see cref="SizeInBytes"/> bytes from a device variable into this pinned host buffer
        /// via cuMemcpyDtoHAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its device pointer is the copy source</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<float3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(this._intPtr, devicePtr.DevicePointer, this.SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Issues a copy of this pinned host buffer to pitched device memory via cuMemcpy2DAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Row pitch of the destination, passed to the driver as dstPitch</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source and destination carry independent pitches; each row moves width * element size bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of this pinned host buffer to a pitched device variable.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch/stream overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<float3> deviceVar, CUstream stream)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            this.AsyncCopyToDevice(target, targetPitch, stream);
        }

        /// <summary>
        /// Issues a copy of pitched device memory into this pinned host buffer via cuMemcpy2DAsync_v2 on the given stream.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Row pitch of the source, passed to the driver as srcPitch</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source and destination carry independent pitches; each row moves width * element size bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer/pitch/stream overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<float3> deviceVar, CUstream stream)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            this.AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer that corresponds to this pinned host allocation, using cuMemHostGetDevicePointer_v2.
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer reported by the driver</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mapped = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mapped, this._intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mapped;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer,
        /// as reported by cuMemHostGetFlags.
        /// </summary>
        /// <returns>Allocation flags reported by the driver</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace label must name the API actually invoked (it previously said cuMemHostGetDevicePointer).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new <see cref="CudaPageLockedHostMemory2DEnumerator_float3"/> bound to this object</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator<float3> IEnumerable<float3>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_float3(this);
        }

        /// <summary>
        /// Creates an untyped enumerator over the elements of this buffer.
        /// </summary>
        /// <returns>A new <see cref="CudaPageLockedHostMemory2DEnumerator_float3"/> bound to this object</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_float3(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_float3.<para/>
    /// Walks the elements row by row: x advances first and wraps to 0 while y is incremented.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_float3 : IEnumerator<float3>
    {
        private CudaPageLockedHostMemory2D_float3 _memory = null;
        // Position starts one step before (0, 0) so that the first MoveNext lands on the first element.
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_float3(CudaPageLockedHostMemory2D_float3 memory)
        {
            _memory = memory;
        }

        /// <summary>
        /// Nothing to release: the enumerator does not own the buffer.
        /// </summary>
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position. Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        public float3 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position, boxed. Only meaningful after <see cref="MoveNext"/> returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at a valid element; false if there are no (more) elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A buffer without columns has no elements at all. Without this guard the wrap-around
            // below would report (0, 1), (0, 2), ... as valid positions for width 0 and height > 1.
            // Also stop advancing once the end has been reached.
            if (width <= 0 || (long)_currentY >= height)
                return false;

            _currentX+=1;
            if ((long)_currentX >= width)
            {
                // End of row: continue at the start of the next one.
                _currentX = 0;
                _currentY+=1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: float4
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_float4: IDisposable, IEnumerable<float4>
	{
		IntPtr _intPtr;
		float4* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float4 and allocates pitch * height bytes of host memory using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed through to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">width * sizeof(float4) exceeds <paramref name="pitchInBytes"/></exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_float4(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            this._intPtr = new IntPtr();
            this._width = width;
            this._pitchInBytes = pitchInBytes;
            this._height = height;
            this._typeSize = (SizeT)Marshal.SizeOf(typeof(float4));
            this._sizeInBytes = this._pitchInBytes * this._height;

            // A row of elements must fit inside one pitch.
            if (this._typeSize * width > this._pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(float4)", "pitchInBytes");
            }

            this.res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref this._intPtr, this._sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", this.res));
            // The typed pointer is set before the result check, mirroring whatever the driver wrote to _intPtr.
            this._ptr = (float4*) this._intPtr;
            if (this.res != CUResult.Success)
            {
                throw new CudaException(this.res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float4 with an explicit pitch and allocates the memory on host.
        /// Chains to the full constructor with allocation flags 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_float4(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {
            // All work happens in the chained constructor.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float4 and allocates the memory on host.<para/>
        /// Pitch is set to width * sizeof(float4); chains to the full constructor with allocation flags 0.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_float4(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(float4)), height, 0)
        {
            // All work happens in the chained constructor.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_float4 and allocates the memory on host with the given flags.<para/>
        /// Pitch is set to width * sizeof(float4); chains to the full constructor.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed through to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_float4(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(float4)), height, allocFlags)
        {
            // All work happens in the chained constructor.
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_float4()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory and removes this object from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            // Finalization is no longer needed once Dispose(true) has run.
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core of the dispose pattern. Does nothing if the object is already disposed.<para/>
        /// When disposing explicitly, the host memory is released with cuMemFreeHost and the object is marked disposed.
        /// On the finalizer path, only a not-disposed warning is written to the debug output; the memory is not freed here.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // The result is only traced; a failing free does not throw.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Gets the raw pointer to the pinned host allocation held by this object.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Gets the width of the buffer, measured in elements.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Gets the height of the buffer, measured in elements (rows).
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Gets the row pitch of the buffer, measured in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Gets the total size of the allocation in bytes (pitch * height, as computed by the constructor).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Gets the size of a single float4 element in bytes, as determined by Marshal.SizeOf in the constructor.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element directly in the pinned host buffer.<para/>
        /// The row start is found by advancing <c>y</c> pitches (in bytes) from the base pointer;
        /// <c>x</c> then indexes elements within that row. No bounds or disposal check is performed.
        /// </summary>
        /// <param name="x">Column index, in elements</param>
        /// <param name="y">Row index, in elements</param>
        /// <returns>The element stored at column <paramref name="x"/> of row <paramref name="y"/></returns>
        public float4 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are pitch bytes apart, so the row offset is computed on a byte pointer.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((float4*)rowStart)[x];
            }
            set
            {
                // Rows are pitch bytes apart, so the row offset is computed on a byte pointer.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((float4*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronously copies this pinned host buffer into a CUDA 2D array via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host source is described by pointer + pitch; each row moves width * element size bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this pinned host buffer into a 2D array wrapper.<para/>
        /// Forwards to the <see cref="CUarray"/> overload using the wrapper's array handle.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray handle = array.CUArray;
            this.SynchronCopyToArray2D(handle);
        }

        /// <summary>
        /// Synchronously copies a CUDA 2D array into this pinned host buffer via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Host destination is described by pointer + pitch; each row moves width * element size bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a 2D array wrapper into this pinned host buffer.<para/>
        /// Forwards to the <see cref="CUarray"/> overload using the wrapper's array handle.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray handle = array.CUArray;
            this.SynchronCopyFromArray2D(handle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from this pinned host buffer to device memory
        /// as one linear block via cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from this pinned host buffer to a device variable
        /// as one linear block via cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its device pointer is the copy target</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<float4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from device memory into this pinned host buffer
        /// as one linear block via cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies <see cref="SizeInBytes"/> bytes from a device variable into this pinned host buffer
        /// as one linear block via cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its device pointer is the copy source</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<float4> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, this.SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this pinned host buffer to pitched device memory via cuMemcpy2D_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Row pitch of the destination, passed to the driver as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source and destination carry independent pitches; each row moves width * element size bytes.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this host buffer to a pitched device variable, using the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable.</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<float4> deviceVar)
        {
            // Delegate to the raw pointer/pitch overload, which does the disposed check and the copy.
            var destination = deviceVar.DevicePointer;
            var destinationPitch = deviceVar.Pitch;
            SynchronCopyToDevice(destination, destinationPitch);
        }

        /// <summary>
        /// Copies pitched device memory, row by row, into this host buffer using cuMemcpy2D_v2.
        /// </summary>
        /// <param name="devicePtr">Source device address.</param>
        /// <param name="pitchDevice">Pitch of the source passed to the driver as srcPitch.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Destination: this pinned host buffer.
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,

                // Source: pitched device memory.
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,

                // Only the payload of each row is copied, not the padding up to the pitch.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a pitched device variable into this host buffer, using the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable.</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<float4> deviceVar)
        {
            // Delegate to the raw pointer/pitch overload, which does the disposed check and the copy.
            var source = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Enqueues a copy of this host buffer to a 2D CUDA array on the given stream using cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer.
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,

                // Destination: CUDA array (no pitch is set for the array side).
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,

                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of this host buffer to a <see cref="CudaArray2D"/> on the given stream.
        /// </summary>
        /// <param name="array">Destination array wrapper; its CUArray handle is used.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            // Delegate to the raw-handle overload, which does the disposed check and the copy.
            var destination = array.CUArray;
            AsyncCopyToArray2D(destination, stream);
        }

        /// <summary>
        /// Enqueues a copy of a 2D CUDA array into this host buffer on the given stream using cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Destination: this pinned host buffer.
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,

                // Source: CUDA array (no pitch is set for the array side).
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,

                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of a <see cref="CudaArray2D"/> into this host buffer on the given stream.
        /// </summary>
        /// <param name="array">Source array wrapper; its CUArray handle is used.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            // Delegate to the raw-handle overload, which does the disposed check and the copy.
            var source = array.CUArray;
            AsyncCopyFromArray2D(source, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Enqueues a copy of <see cref="SizeInBytes"/> bytes from this host buffer to device memory using cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device address.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of <see cref="SizeInBytes"/> bytes from device memory into this host buffer using cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device address.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of <see cref="SizeInBytes"/> bytes from this host buffer to a device variable using cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Device variable whose DevicePointer is the copy destination.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<float4> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            var destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                destination, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of <see cref="SizeInBytes"/> bytes from a device variable into this host buffer using cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Device variable whose DevicePointer is the copy source.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<float4> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            var source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr, source, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Enqueues a row-by-row copy of this host buffer to pitched device memory using cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device address.</param>
        /// <param name="pitchDevice">Pitch of the destination passed to the driver as dstPitch.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer.
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,

                // Destination: pitched device memory.
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,

                // Only the payload of each row is copied, not the padding up to the pitch.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of this host buffer to a pitched device variable, using the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<float4> deviceVar, CUstream stream)
        {
            // Delegate to the raw pointer/pitch overload, which does the disposed check and the copy.
            var destination = deviceVar.DevicePointer;
            var destinationPitch = deviceVar.Pitch;
            AsyncCopyToDevice(destination, destinationPitch, stream);
        }

        /// <summary>
        /// Enqueues a row-by-row copy of pitched device memory into this host buffer using cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device address.</param>
        /// <param name="pitchDevice">Pitch of the source passed to the driver as srcPitch.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Destination: this pinned host buffer.
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,

                // Source: pitched device memory.
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,

                // Only the payload of each row is copied, not the padding up to the pitch.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of a pitched device variable into this host buffer, using the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<float4> deviceVar, CUstream stream)
        {
            // Delegate to the raw pointer/pitch overload, which does the disposed check and the copy.
            var source = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Asks the driver (cuMemHostGetDevicePointer_v2) for the device address this pinned host buffer is mapped to.
        /// Only valid if the context was created with flag <see cref="CUCtxFlags.MapHost"/>.
        /// </summary>
        /// <returns>Device pointer reported by the driver.</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer, as reported by cuMemHostGetFlags.
        /// </summary>
        /// <returns>Allocation flags reported by the driver for this buffer.</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the API that was actually called; the label used to name cuMemHostGetDevicePointer,
            // which made the debug output misleading.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        IEnumerator<float4> IEnumerable<float4>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_float4(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over the elements of this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_float4(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_float4.<para/>
    /// Elements are visited row by row: x runs from 0 to Width - 1 before y is advanced.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_float4 : IEnumerator<float4>
    {
        // Buffer being enumerated; elements are read through its [x, y] indexer.
        private CudaPageLockedHostMemory2D_float4 _memory = null;
        // Column of the current element; -1 means "before the first element".
        private SizeT _currentX = -1;
        // Row of the current element.
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate.</param>
        public CudaPageLockedHostMemory2DEnumerator_float4(CudaPageLockedHostMemory2D_float4 memory)
        {
            _memory = memory;
        }

        // The enumerator owns no resources; the enumerated buffer is not disposed here.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current (x, y) position.
        /// </summary>
        public float4 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current (x, y) position, boxed.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element, wrapping to the start of the next row at the end of a row.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false once all rows have been passed
        /// or when the buffer has no columns.</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;

            // A buffer without columns has no elements. Without this guard the wrap-around below
            // would keep x at 0, step to the next row and report an element that does not exist.
            if (width <= 0)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: double
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_double: IDisposable, IEnumerable<double>
	{
		// Host pointer filled in by cuMemHostAlloc in the constructor; exposed through PinnedHostPointer.
		IntPtr _intPtr;
		// Typed view of _intPtr, used by the [x, y] indexer.
		double* _ptr;
        // Total allocation size: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Row length in elements.
        SizeT _width = 0;
        // Row stride in bytes (at least _width * _typeSize, enforced by the constructor).
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(double)), set in the constructor.
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made by this instance.
        CUResult res;
        // Set to true once Dispose(true) has called cuMemFreeHost.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double and allocates pitchInBytes * height bytes of host memory with cuMemHostAlloc.
        /// </summary>
        /// <param name="width">Row length in elements.</param>
        /// <param name="pitchInBytes">Row stride in bytes, including alignment padding.</param>
        /// <param name="height">Number of rows.</param>
        /// <param name="allocFlags">Flags passed through to cuMemHostAlloc.</param>
        /// <exception cref="ArgumentException"><paramref name="pitchInBytes"/> is smaller than width * sizeof(double).</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/>.</exception>
        public CudaPageLockedHostMemory2D_double(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = IntPtr.Zero;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(double));
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _sizeInBytes = _pitchInBytes * _height;

            // A row must be able to hold all of its elements.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(double)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));

            // The typed pointer is assigned before the result check, as before.
            _ptr = (double*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double with an explicit pitch and allocates the host memory with cuMemHostAlloc, passing no flags (0).
        /// </summary>
        /// <param name="width">Row length in elements.</param>
        /// <param name="pitchInBytes">Row stride in bytes, including alignment padding.</param>
        /// <param name="height">Number of rows.</param>
        public CudaPageLockedHostMemory2D_double(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double with densely packed rows and allocates the host memory with cuMemHostAlloc, passing no flags (0).<para/>
        /// The pitch is set to width * sizeof(double).
        /// </summary>
        /// <param name="width">Row length in elements.</param>
        /// <param name="height">Number of rows.</param>
        public CudaPageLockedHostMemory2D_double(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(double)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double with densely packed rows and allocates the host memory with cuMemHostAlloc using the given flags.<para/>
        /// The pitch is set to width * sizeof(double).
        /// </summary>
        /// <param name="width">Row length in elements.</param>
        /// <param name="height">Number of rows.</param>
        /// <param name="allocFlags">Flags passed through to cuMemHostAlloc.</param>
        public CudaPageLockedHostMemory2D_double(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(double)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_double()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory and suppresses finalization of this instance.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core of the dispose pattern.<para/>
        /// When called with true on a live instance, the host memory is released with cuMemFreeHost and the
        /// instance is marked disposed. When called with false (finalizer) on a live instance, nothing is
        /// freed; only a not-disposed warning is written to the debug output.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer.</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                // Already released: both paths are no-ops.
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(
                    String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(
                    String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Host pointer of the pinned allocation, as filled in by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return _intPtr;
            }
        }

        /// <summary>
        /// Row length in elements.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return _width;
            }
        }

        /// <summary>
        /// Number of rows (in elements).
        /// </summary>
        public SizeT Height
        {
            get
            {
                return _height;
            }
        }

        /// <summary>
        /// Row stride in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return _pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return _sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one element (double) in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return _typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element through the raw host pointer.<para/>
        /// The row start is located at y * pitch bytes from the beginning of the buffer. No bounds or disposed check is performed.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at column x of row y.</returns>
        public double this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are addressed in bytes because the pitch may include padding.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                double* row = (double*)rowStart;
                return row[x];
            }
            set
            {
                // Rows are addressed in bytes because the pitch may include padding.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                double* row = (double*)rowStart;
                row[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Copies this host buffer to a 2D CUDA array using cuMemcpy2D_v2.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer.
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,

                // Destination: CUDA array (no pitch is set for the array side).
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,

                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this host buffer to a <see cref="CudaArray2D"/>.
        /// </summary>
        /// <param name="array">Destination array wrapper; its CUArray handle is used.</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            // Delegate to the raw-handle overload, which does the disposed check and the copy.
            var destination = array.CUArray;
            SynchronCopyToArray2D(destination);
        }

        /// <summary>
        /// Copies a 2D CUDA array into this host buffer using cuMemcpy2D_v2.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Destination: this pinned host buffer.
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,

                // Source: CUDA array (no pitch is set for the array side).
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,

                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a <see cref="CudaArray2D"/> into this host buffer.
        /// </summary>
        /// <param name="array">Source array wrapper; its CUArray handle is used.</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            // Delegate to the raw-handle overload, which does the disposed check and the copy.
            var source = array.CUArray;
            SynchronCopyFromArray2D(source);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from this host buffer to device memory using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device address.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from this host buffer to a device variable using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Device variable whose DevicePointer is the copy destination.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<double> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            var destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(destination, _intPtr, _sizeInBytes);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from device memory into this host buffer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Device address the data is read from.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies <see cref="SizeInBytes"/> bytes from a device variable into this host buffer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Device variable whose DevicePointer is the copy source.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<double> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            var source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, _sizeInBytes);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Copies this host buffer, row by row, to pitched device memory using cuMemcpy2D_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device address.</param>
        /// <param name="pitchDevice">Pitch of the destination passed to the driver as dstPitch.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer.
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,

                // Destination: pitched device memory.
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,

                // Only the payload of each row is copied, not the padding up to the pitch.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies this host buffer to a pitched device variable, using the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable.</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<double> deviceVar)
        {
            // Delegate to the raw pointer/pitch overload, which does the disposed check and the copy.
            var destination = deviceVar.DevicePointer;
            var destinationPitch = deviceVar.Pitch;
            SynchronCopyToDevice(destination, destinationPitch);
        }

        /// <summary>
        /// Copies pitched device memory, row by row, into this host buffer using cuMemcpy2D_v2.
        /// </summary>
        /// <param name="devicePtr">Source device address.</param>
        /// <param name="pitchDevice">Pitch of the source passed to the driver as srcPitch.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Destination: this pinned host buffer.
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,

                // Source: pitched device memory.
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,

                // Only the payload of each row is copied, not the padding up to the pitch.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Copies a pitched device variable into this host buffer, using the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable.</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<double> deviceVar)
        {
            // Delegate to the raw pointer/pitch overload, which does the disposed check and the copy.
            var source = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Enqueues a copy of this host buffer to a 2D CUDA array on the given stream using cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Source: this pinned host buffer.
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,

                // Destination: CUDA array (no pitch is set for the array side).
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,

                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of this host buffer to a <see cref="CudaArray2D"/> on the given stream.
        /// </summary>
        /// <param name="array">Destination array wrapper; its CUArray handle is used.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            // Delegate to the raw-handle overload, which does the disposed check and the copy.
            var destination = array.CUArray;
            AsyncCopyToArray2D(destination, stream);
        }

        /// <summary>
        /// Enqueues a copy of a 2D CUDA array into this host buffer on the given stream using cuMemcpy2DAsync_v2.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                // Destination: this pinned host buffer.
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,

                // Source: CUDA array (no pitch is set for the array side).
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,

                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of a <see cref="CudaArray2D"/> into this host buffer on the given stream.
        /// </summary>
        /// <param name="array">Source array wrapper; its CUArray handle is used.</param>
        /// <param name="stream">Stream passed to the driver call.</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            // Delegate to the raw-handle overload, which does the disposed check and the copy.
            var source = array.CUArray;
            AsyncCopyFromArray2D(source, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy host to device. Calls cuMemcpyHtoDAsync_v2 for SizeInBytes bytes
        /// starting at the pinned host pointer.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy device to host. Calls cuMemcpyDtoHAsync_v2 for SizeInBytes bytes
        /// into the pinned host pointer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy host to device. Calls cuMemcpyHtoDAsync_v2 for SizeInBytes bytes,
        /// targeting the device pointer of <paramref name="devicePtr"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<double> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy device to host. Calls cuMemcpyDtoHAsync_v2 for SizeInBytes bytes,
        /// reading from the device pointer of <paramref name="devicePtr"/>.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<double> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous copy host to pitched device memory. Calls cuMemcpy2DAsync_v2 with this buffer as source,
        /// copying Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy host to pitched device memory. Forwards the device pointer and pitch of
        /// <paramref name="deviceVar"/> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<double> deviceVar, CUstream stream)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            this.AsyncCopyToDevice(target, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronous copy pitched device memory to host. Calls cuMemcpy2DAsync_v2 with this buffer as destination,
        /// copying Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy pitched device memory to host. Forwards the device pointer and pitch of
        /// <paramref name="deviceVar"/> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<double> deviceVar, CUstream stream)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            this.AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer reported by cuMemHostGetDevicePointer_v2 for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr mapped = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mapped, _intPtr, 0);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mapped;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns>Flags reported by cuMemHostGetFlags for this buffer</returns>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log under the name of the API that was actually called; the previous label
            // ("cuMemHostGetDevicePointer") made failures here look like they came from GetDevicePointer().
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a new typed enumerator over this memory.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        IEnumerator<double> IEnumerable<double>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_double(this);
        }

        /// <summary>
        /// Creates a new non-generic enumerator over this memory.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_double(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_double. Elements are visited row by row:
    /// x runs over one row, then y advances to the next row.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_double : IEnumerator<double>
    {
        private CudaPageLockedHostMemory2D_double _memory = null;
        // Starts one column before the first element; the first MoveNext() steps onto [0, 0].
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator over the given memory. Only the reference is stored.
        /// </summary>
        /// <param name="memory">Memory whose elements are enumerated</param>
        public CudaPageLockedHostMemory2DEnumerator_double(CudaPageLockedHostMemory2D_double memory)
        {
            _memory = memory;
        }

        // Nothing to release: the enumerator does not own the memory it walks over.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position. This getter does not validate the position itself,
        /// so it should only be read after MoveNext() returned true.
        /// </summary>
        public double Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position, boxed. Same position rules as the typed Current.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false if there are no (more) elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A zero-width memory has no elements at all. Without this guard the column wrap below
            // would step through the rows and report elements that do not exist.
            // Once the end has been reached, stay there instead of advancing further on repeated calls.
            if (width <= 0 || (long)_currentY >= height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                // End of row reached: continue with the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: double1
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_double1: IDisposable, IEnumerable<double1>
	{
		// Address of the host allocation, filled in by cuMemHostAlloc in the constructor.
		IntPtr _intPtr;
		// Same address as _intPtr, typed as double1* for element access in the indexer.
		double1* _ptr;
        // Allocation size in bytes; the constructor sets it to pitch * height.
        SizeT _sizeInBytes = 0;
        // Width in elements.
        SizeT _width = 0;
        // Distance between the starts of two consecutive rows, in bytes.
        SizeT _pitchInBytes = 0;
        // Height in elements (number of rows).
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(double1)), cached by the constructor.
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made through this instance.
        CUResult res;
        // Set once Dispose() has released the host allocation.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double1 and allocates pitchInBytes * height bytes
        /// of host memory using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(double1)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_double1(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();

            // Record the geometry first; the pitch check below relies on _typeSize and _pitchInBytes.
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(double1));
            _sizeInBytes = _pitchInBytes * _height;

            // A row must be able to hold 'width' elements.
            if (_pitchInBytes < _typeSize * width)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(double1)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (double1*)_intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double1 and allocates the memory on host.
        /// Delegates to the four-argument constructor with allocation flags 0 (cuMemHostAlloc without flags).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_double1(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double1 and allocates the memory on host.<para/>
        /// Pitch is assumed to be width * sizeof(double1). Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_double1(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(double1)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double1 and allocates the memory on host.<para/>
        /// Pitch is assumed to be width * sizeof(double1). Using cuMemHostAlloc with the given flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed on to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_double1(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(
                width,
                width * (SizeT)Marshal.SizeOf(typeof(double1)),
                height,
                allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_double1()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the host allocation and removes this instance from the finalization queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable. When called with <paramref name="fDisposing"/> set, frees the host memory
        /// with cuMemFreeHost and marks the instance as disposed. When called from the finalizer
        /// the memory is not freed; only a debug warning is written.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Already released: nothing to free and nothing to warn about.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(
                    String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(
                    String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to the pinned host memory of this instance.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width in elements, as passed to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Height in elements, as passed to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Pitch (row width including alignment) in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Size of the allocation in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one element (double1) in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element. The row is located at byte offset pitch * y from the start of
        /// the allocation; x then indexes elements within that row. Indices are not range checked here.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>Element at column x of row y</returns>
        public double1 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are addressed in bytes because the pitch need not be a multiple of the element size.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                double1* row = (double1*)rowStart;
                return row[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                double1* row = (double1*)rowStart;
                row[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy host to 2D Array. Calls cuMemcpy2D_v2 with this buffer as source,
        /// copying Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy host to 2D Array. Forwards to the <see cref="CUarray"/> overload
        /// using the handle exposed by <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray nativeArray = array.CUArray;
            this.SynchronCopyToArray2D(nativeArray);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host. Calls cuMemcpy2D_v2 with this buffer as destination,
        /// copying Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy 2D Array to host. Forwards to the <see cref="CUarray"/> overload
        /// using the handle exposed by <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray nativeArray = array.CUArray;
            this.SynchronCopyFromArray2D(nativeArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous copy host to device. Calls cuMemcpyHtoD_v2 for SizeInBytes bytes
        /// (pitch * height) starting at the pinned host pointer.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(
                devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy host to device. Calls cuMemcpyHtoD_v2 for SizeInBytes bytes
        /// (pitch * height), targeting the device pointer of <paramref name="devicePtr"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<double1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(
                devicePtr.DevicePointer, _intPtr, SizeInBytes);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy device to host. Calls cuMemcpyDtoH_v2 for SizeInBytes bytes
        /// (pitch * height) into the pinned host pointer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(
                _intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy device to host. Calls cuMemcpyDtoH_v2 for SizeInBytes bytes
        /// (pitch * height), reading from the device pointer of <paramref name="devicePtr"/>.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<double1> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(
                _intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous copy host to pitched device memory. Calls cuMemcpy2D_v2 with this buffer as source,
        /// copying Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy host to pitched device memory. Forwards the device pointer and pitch of
        /// <paramref name="deviceVar"/> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<double1> deviceVar)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            this.SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Synchronous copy pitched device memory to host. Calls cuMemcpy2D_v2 with this buffer as destination,
        /// copying Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy pitched device memory to host. Forwards the device pointer and pitch of
        /// <paramref name="deviceVar"/> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<double1> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            this.SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy host to 2D Array. Calls cuMemcpy2DAsync_v2 with this buffer as source,
        /// copying Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy host to 2D Array. Forwards to the <see cref="CUarray"/> overload
        /// using the handle exposed by <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray nativeArray = array.CUArray;
            this.AsyncCopyToArray2D(nativeArray, stream);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host. Calls cuMemcpy2DAsync_v2 with this buffer as destination,
        /// copying Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host. Forwards to the <see cref="CUarray"/> overload
        /// using the handle exposed by <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray nativeArray = array.CUArray;
            this.AsyncCopyFromArray2D(nativeArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy host to device. Calls cuMemcpyHtoDAsync_v2 for SizeInBytes bytes
        /// (pitch * height) starting at the pinned host pointer.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy device to host. Calls cuMemcpyDtoHAsync_v2 for SizeInBytes bytes
        /// (pitch * height) into the pinned host pointer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy host to device. Calls cuMemcpyHtoDAsync_v2 for SizeInBytes bytes
        /// (pitch * height), targeting the device pointer of <paramref name="devicePtr"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<double1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy device to host. Calls cuMemcpyDtoHAsync_v2 for SizeInBytes bytes
        /// (pitch * height), reading from the device pointer of <paramref name="devicePtr"/>.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<double1> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous copy host to pitched device memory. Calls cuMemcpy2DAsync_v2 with this buffer as source,
        /// copying Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy host to pitched device memory. Forwards the device pointer and pitch of
        /// <paramref name="deviceVar"/> to the pointer/pitch overload.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<double1> deviceVar, CUstream stream)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            this.AsyncCopyToDevice(target, targetPitch, stream);
        }

        /// <summary>
        /// Asynchronous copy pitched device memory to host. Calls cuMemcpy2DAsync_v2 with this buffer as destination,
        /// copying Width * TypeSize bytes per row for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This instance has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(
                String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronously copies a pitched device variable into this pinned host buffer.<para/>
        /// Forwards the variable's device pointer and pitch to the pointer based overload.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<double1> deviceVar, CUstream stream)
        {
            CUdeviceptr sourcePointer = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;

            AsyncCopyFromDevice(sourcePointer, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer obtained from cuMemHostGetDevicePointer for this host allocation</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res == CUResult.Success)
            {
                return mappedPointer;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>Flags reported by the driver for this host allocation</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the API that was actually called (the label used to name cuMemHostGetDevicePointer).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator<double1> IEnumerable<double1>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_double1(this);
        }

        /// <summary>
        /// Creates an untyped enumerator over the elements of this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_double1(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_double1.<para/>
    /// Walks the elements row by row: the X index runs from 0 to Width - 1 before the Y index advances.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_double1 : IEnumerator<double1>
    {
        private CudaPageLockedHostMemory2D_double1 _memory = null;
        // Position of the current element; X starts one step before the first column.
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_double1(CudaPageLockedHostMemory2D_double1 memory)
        {
            _memory = memory;
        }

        // The enumerator holds no resources of its own; the buffer is not disposed here.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Sets the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position, read through the buffer's indexer.
        /// </summary>
        public double1 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed), read through the buffer's indexer.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the new position lies inside the buffer, false once all rows have been visited
        /// or if the buffer has no columns</returns>
        public bool MoveNext()
        {
            // A buffer without columns contains no elements. Without this guard the wrap-around
            // below would skip to row 1 and report elements that do not exist.
            if ((long)_memory.Width <= 0)
                return false;

            _currentX+=1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of the row reached: continue with the first column of the next row.
                _currentX = 0;
                _currentY+=1;
            }

            if ((long)_currentY >= (long)_memory.Height)
                return false;
            else
                return true;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: double2
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_double2: IDisposable, IEnumerable<double2>
	{
		// Host allocation as returned by cuMemHostAlloc, and the same address as typed pointer.
		IntPtr _intPtr;
		double2* _ptr;
        // Total allocation size: pitch in bytes * height.
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(double2)).
        SizeT _typeSize = 0;
        // Result of the most recent driver API call.
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double2 and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException"><paramref name="pitchInBytes"/> is smaller than width * sizeof(double2)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_double2(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(double2));
            _sizeInBytes = _pitchInBytes * _height;

            // A row of width elements must fit into one pitch.
            if (_typeSize * width > _pitchInBytes)
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(double2)", "pitchInBytes");

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (double2*) _intPtr;
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double2 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_double2(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double2 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(double2).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_double2(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(double2)), height, 0)
        {

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_double2 and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(double2).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_double2(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(double2)), height, allocFlags)
        {

        }

        /// <summary>
        /// Finalizer. Calls Dispose(false), which only writes a debug warning if the object was never disposed.
        /// </summary>
        ~CudaPageLockedHostMemory2D_double2()
        {
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory and suppresses finalization.
        /// </summary>
        public void Dispose()
        {
            Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable.<para/>
        /// The host memory is released with cuMemFreeHost only when <paramref name="fDisposing"/> is true.
        /// When called from the finalizer on a not yet disposed object, only a debug warning is written.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (fDisposing && !disposed)
            {
                // The result is only traced here; a failing cuMemFreeHost does not throw.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            if (!fDisposing && !disposed)
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get { return _intPtr; }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get { return _width; }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get { return _height; }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get { return _pitchInBytes; }
        }

        /// <summary>
        /// Size in bytes (pitch in bytes * height)
        /// </summary>
        public SizeT SizeInBytes
        {
            get { return _sizeInBytes; }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get { return _typeSize; }
        }

        /// <summary>
        /// Access array per element.<para/>
        /// The indices are not range checked and the disposed state is not checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>Element in row <paramref name="y"/>, column <paramref name="x"/></returns>
        public double2 this[SizeT x, SizeT y]
        {
            get
            {
                // Rows start every _pitchInBytes bytes, so the row offset is computed on a byte pointer.
                double2* line = (double2*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                double2* line = (double2*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy host to 2D Array (cuMemcpy2D)
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host (cuMemcpy2D)
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous copy host to device (cuMemcpyHtoD). Copies SizeInBytes bytes, i.e. the whole allocation including pitch padding.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to device (cuMemcpyHtoD). Copies SizeInBytes bytes, i.e. the whole allocation including pitch padding.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<double2> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, this._intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy device to host (cuMemcpyDtoH). Copies SizeInBytes bytes into this buffer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy device to host (cuMemcpyDtoH). Copies SizeInBytes bytes into this buffer.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<double2> devicePtr)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(this._intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous copy host to pitched device (cuMemcpy2D)
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Value used as destination pitch of the copy</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<double2> deviceVar)
        {
            SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous copy pitched device to host (cuMemcpy2D)
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Value used as source pitch of the copy</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Synchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<double2> deviceVar)
        {
            SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy host to 2D Array (cuMemcpy2DAsync)
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstArray = deviceArray;
            cpyProps.dstMemoryType = CUMemoryType.Array;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to 2D Array
        /// </summary>
        /// <param name="array">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host (cuMemcpy2DAsync)
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcArray = deviceArray;
            cpyProps.srcMemoryType = CUMemoryType.Array;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy 2D Array to host
        /// </summary>
        /// <param name="array">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous copy host to device (cuMemcpyHtoDAsync). Copies SizeInBytes bytes, i.e. the whole allocation including pitch padding.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy device to host (cuMemcpyDtoHAsync). Copies SizeInBytes bytes into this buffer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to device (cuMemcpyHtoDAsync). Copies SizeInBytes bytes, i.e. the whole allocation including pitch padding.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<double2> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy device to host (cuMemcpyDtoHAsync). Copies SizeInBytes bytes into this buffer.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<double2> devicePtr, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous copy host to pitched device (cuMemcpy2DAsync)
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Value used as destination pitch of the copy</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.dstDevice = devicePtr;
            cpyProps.dstMemoryType = CUMemoryType.Device;
            cpyProps.dstPitch = pitchDevice;
            cpyProps.srcHost = _intPtr;
            cpyProps.srcMemoryType = CUMemoryType.Host;
            cpyProps.srcPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy host to pitched device
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<double2> deviceVar, CUstream stream)
        {
            AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host (cuMemcpy2DAsync)
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Value used as source pitch of the copy</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D();
            cpyProps.srcDevice = devicePtr;
            cpyProps.srcMemoryType = CUMemoryType.Device;
            cpyProps.srcPitch = pitchDevice;
            cpyProps.dstHost = _intPtr;
            cpyProps.dstMemoryType = CUMemoryType.Host;
            cpyProps.dstPitch = _pitchInBytes;
            cpyProps.WidthInBytes = _width * _typeSize;
            cpyProps.Height = _height;

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success) throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous copy pitched device to host
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<double2> deviceVar, CUstream stream)
        {
            AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUdeviceptr ptr = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref ptr, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return ptr;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags)
        /// </summary>
        /// <returns>Flags reported by the driver for this host allocation</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the API that was actually called (the label used to name cuMemHostGetDevicePointer).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Both enumerators walk the buffer element by element via CudaPageLockedHostMemory2DEnumerator_double2.
        IEnumerator<double2> IEnumerable<double2>.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator<double2> enumerator = new CudaPageLockedHostMemory2DEnumerator_double2(this);
            return enumerator;
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            IEnumerator enumerator = new CudaPageLockedHostMemory2DEnumerator_double2(this);
            return enumerator;
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_double2.<para/>
    /// Walks the elements row by row: the X index runs from 0 to Width - 1 before the Y index advances.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_double2 : IEnumerator<double2>
    {
        private CudaPageLockedHostMemory2D_double2 _memory = null;
        // Position of the current element; X starts one step before the first column.
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_double2(CudaPageLockedHostMemory2D_double2 memory)
        {
            _memory = memory;
        }

        // The enumerator holds no resources of its own; the buffer is not disposed here.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Sets the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position, read through the buffer's indexer.
        /// </summary>
        public double2 Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position (boxed), read through the buffer's indexer.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the new position lies inside the buffer, false once all rows have been visited
        /// or if the buffer has no columns</returns>
        public bool MoveNext()
        {
            // A buffer without columns contains no elements. Without this guard the wrap-around
            // below would skip to row 1 and report elements that do not exist.
            if ((long)_memory.Width <= 0)
                return false;

            _currentX+=1;
            if ((long)_currentX >= (long)_memory.Width)
            {
                // End of the row reached: continue with the first column of the next row.
                _currentX = 0;
                _currentY+=1;
            }

            if ((long)_currentY >= (long)_memory.Height)
                return false;
            else
                return true;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: cuDoubleComplex
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_cuDoubleComplex: IDisposable, IEnumerable<cuDoubleComplex>
	{
		IntPtr _intPtr;
		cuDoubleComplex* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Allocates a page-locked host buffer of pitchInBytes * height bytes with cuMemHostAlloc
        /// and stores its 2D layout.
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="pitchInBytes">Row stride in bytes (width including alignment); must be at least width * sizeof(cuDoubleComplex)</param>
        /// <param name="height">Number of rows in elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">The pitch is smaller than one row of elements.</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return success.</exception>
        public CudaPageLockedHostMemory2D_cuDoubleComplex(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            this._intPtr = new IntPtr();
            this._width = width;
            this._pitchInBytes = pitchInBytes;
            this._height = height;
            this._typeSize = (SizeT)Marshal.SizeOf(typeof(cuDoubleComplex));
            this._sizeInBytes = this._pitchInBytes * this._height;

            // One full row of elements has to fit into a single pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(cuDoubleComplex)", "pitchInBytes");
            }

            this.res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            // The typed pointer is set before the result check, as in the untyped handle above.
            this._ptr = (cuDoubleComplex*)_intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Allocates a page-locked host buffer with an explicit pitch, passing 0 (no flags) to cuMemHostAlloc.
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="pitchInBytes">Row stride in bytes (width including alignment)</param>
        /// <param name="height">Number of rows in elements</param>
        public CudaPageLockedHostMemory2D_cuDoubleComplex(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Allocates a densely packed page-locked host buffer, passing 0 (no flags) to cuMemHostAlloc.<para/>
        /// The pitch is set to width * sizeof(cuDoubleComplex).
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="height">Number of rows in elements</param>
        public CudaPageLockedHostMemory2D_cuDoubleComplex(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(cuDoubleComplex)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Allocates a densely packed page-locked host buffer with the given cuMemHostAlloc flags.<para/>
        /// The pitch is set to width * sizeof(cuDoubleComplex).
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="height">Number of rows in elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_cuDoubleComplex(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(cuDoubleComplex)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_cuDoubleComplex()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory and removes this object from the finalizer queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core of the dispose pattern. When called from <see cref="Dispose()"/> the memory is released
        /// with cuMemFreeHost; when called from the finalizer only a debug warning is written.
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                // Nothing left to release and nothing to warn about.
                return;
            }

            if (fDisposing)
            {
                // The result is only traced; no exception is raised from here.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(string.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Host pointer to the pinned memory block, as filled in by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Row length in elements, as passed to the constructor.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows in elements, as passed to the constructor.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Row stride in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total allocation size in bytes (pitch * height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of one element in bytes, as reported by Marshal.SizeOf in the constructor.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element. The row start is located via the pitch, then indexed by x.<para/>
        /// No bounds or disposed checks are performed.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at column x of row y</returns>
        public cuDoubleComplex this[SizeT x, SizeT y]
        {
            get
            {
                // Rows are pitch bytes apart, so the row offset is computed on a byte pointer.
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((cuDoubleComplex*)rowStart)[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((cuDoubleComplex*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronously copies this host buffer into a 2D CUDA array using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied per row, for Height rows; the host rows are Pitch bytes apart.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer into the CUDA array wrapped by <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray target = array.CUArray;
            SynchronCopyToArray2D(target);
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this host buffer using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied per row, for Height rows; the host rows are Pitch bytes apart.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies the CUDA array wrapped by <paramref name="array"/> into this host buffer.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray source = array.CUArray;
            SynchronCopyFromArray2D(source);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronously copies the whole host buffer (SizeInBytes bytes) to device memory using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies the whole host buffer (SizeInBytes bytes) to a device variable using cuMemcpyHtoD_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<cuDoubleComplex> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(target, _intPtr, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from device memory into this host buffer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from a device variable into this host buffer using cuMemcpyDtoH_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<cuDoubleComplex> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, source, SizeInBytes);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this host buffer to pitched device memory using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Row stride of the destination, used as dstPitch</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer to a pitched device variable, using its pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<cuDoubleComplex> deviceVar)
        {
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronously copies pitched device memory into this host buffer using cuMemcpy2D_v2.<para/>
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Row stride of the source, used as srcPitch</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref cpyProps);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a pitched device variable into this host buffer, using its pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<cuDoubleComplex> deviceVar)
        {
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Enqueues a copy of this host buffer into a 2D CUDA array on <paramref name="stream"/> using cuMemcpy2DAsync_v2.<para/>
        /// Width * TypeSize bytes are copied per row, for Height rows; the host rows are Pitch bytes apart.
        /// </summary>
        /// <param name="deviceArray">Destination array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of this host buffer into the CUDA array wrapped by <paramref name="array"/>.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray target = array.CUArray;
            AsyncCopyToArray2D(target, stream);
        }

        /// <summary>
        /// Enqueues a copy of a 2D CUDA array into this host buffer on <paramref name="stream"/> using cuMemcpy2DAsync_v2.<para/>
        /// Width * TypeSize bytes are copied per row, for Height rows; the host rows are Pitch bytes apart.
        /// </summary>
        /// <param name="deviceArray">Source array</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of the CUDA array wrapped by <paramref name="array"/> into this host buffer.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray source = array.CUArray;
            AsyncCopyFromArray2D(source, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Enqueues a copy of the whole host buffer (SizeInBytes bytes) to device memory using cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of SizeInBytes bytes from device memory into this host buffer using cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of the whole host buffer (SizeInBytes bytes) to a device variable using cuMemcpyHtoDAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Destination device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<cuDoubleComplex> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr target = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(target, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of SizeInBytes bytes from a device variable into this host buffer using cuMemcpyDtoHAsync_v2.
        /// </summary>
        /// <param name="devicePtr">Source device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<cuDoubleComplex> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, source, SizeInBytes, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Enqueues a copy of this host buffer to pitched device memory on <paramref name="stream"/> using cuMemcpy2DAsync_v2.<para/>
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Row stride of the destination, used as dstPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of this host buffer to a pitched device variable, using its pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<cuDoubleComplex> deviceVar, CUstream stream)
        {
            this.AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Enqueues a copy of pitched device memory into this host buffer on <paramref name="stream"/> using cuMemcpy2DAsync_v2.<para/>
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Row stride of the source, used as srcPitch</param>
        /// <param name="stream">Stream the copy is issued on</param>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUDAMemCpy2D cpyProps = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref cpyProps, stream);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Enqueues a copy of a pitched device variable into this host buffer, using its pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<cuDoubleComplex> deviceVar, CUstream stream)
        {
            this.AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries cuMemHostGetDevicePointer_v2 for the device-side address of this pinned host memory.
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mapped = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mapped, _intPtr, 0);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mapped;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer,
        /// as reported by cuMemHostGetFlags.
        /// </summary>
        /// <returns>Allocation flags of this buffer</returns>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return success.</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace under the name of the function actually called (previously mislabelled as cuMemHostGetDevicePointer).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over the elements of this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        IEnumerator<cuDoubleComplex> IEnumerable<cuDoubleComplex>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_cuDoubleComplex(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over the elements of this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">This object has been disposed.</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_cuDoubleComplex(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_cuDoubleComplex.<para/>
    /// Visits the elements row by row: x runs from 0 to Width-1 for every y from 0 to Height-1.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_cuDoubleComplex : IEnumerator<cuDoubleComplex>
    {
        private CudaPageLockedHostMemory2D_cuDoubleComplex _memory = null;
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // False until the first successful MoveNext(). Used instead of a -1 start sentinel.
        // NOTE(review): SizeT is declared elsewhere; how it represents negative values is not visible here.
        private bool _started = false;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_cuDoubleComplex(CudaPageLockedHostMemory2D_cuDoubleComplex memory)
        {
            _memory = memory;
        }

        // The enumerator holds no resources of its own; the buffer stays owned by its creator.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = 0;
            _currentY = 0;
            _started = false;
        }

        /// <summary>
        /// Element at the current (x, y) position. Only meaningful after MoveNext() returned true.
        /// </summary>
        public cuDoubleComplex Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Boxed element at the current (x, y) position. Only meaningful after MoveNext() returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now points at a valid element, false once all rows are consumed or the buffer is empty</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A buffer without columns or rows has no elements. Without this guard a
            // zero-width buffer would wrap to the next row and report a non-existent element.
            if (width <= 0 || height <= 0)
                return false;

            if (!_started)
            {
                _started = true;
                _currentX = 0;
                _currentY = 0;
                return true;
            }

            // Already behind the last row: stay there.
            if ((long)_currentY >= height)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                // End of row reached, continue with the first column of the next row.
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: cuDoubleReal
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_cuDoubleReal: IDisposable, IEnumerable<cuDoubleReal>
	{
		IntPtr _intPtr;
		cuDoubleReal* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Allocates a page-locked host buffer of pitchInBytes * height bytes with cuMemHostAlloc
        /// and stores its 2D layout.
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="pitchInBytes">Row stride in bytes (width including alignment); must be at least width * sizeof(cuDoubleReal)</param>
        /// <param name="height">Number of rows in elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">The pitch is smaller than one row of elements.</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return success.</exception>
        public CudaPageLockedHostMemory2D_cuDoubleReal(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            this._intPtr = new IntPtr();
            this._width = width;
            this._pitchInBytes = pitchInBytes;
            this._height = height;
            this._typeSize = (SizeT)Marshal.SizeOf(typeof(cuDoubleReal));
            this._sizeInBytes = this._pitchInBytes * this._height;

            // One full row of elements has to fit into a single pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(cuDoubleReal)", "pitchInBytes");
            }

            this.res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            // The typed pointer is set before the result check, as in the untyped handle above.
            this._ptr = (cuDoubleReal*)_intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Allocates a page-locked host buffer with an explicit pitch, passing 0 (no flags) to cuMemHostAlloc.
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="pitchInBytes">Row stride in bytes (width including alignment)</param>
        /// <param name="height">Number of rows in elements</param>
        public CudaPageLockedHostMemory2D_cuDoubleReal(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Allocates a densely packed page-locked host buffer, passing 0 (no flags) to cuMemHostAlloc.<para/>
        /// The pitch is set to width * sizeof(cuDoubleReal).
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="height">Number of rows in elements</param>
        public CudaPageLockedHostMemory2D_cuDoubleReal(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(cuDoubleReal)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Allocates a densely packed page-locked host buffer with the given cuMemHostAlloc flags.<para/>
        /// The pitch is set to width * sizeof(cuDoubleReal).
        /// </summary>
        /// <param name="width">Row length in elements</param>
        /// <param name="height">Number of rows in elements</param>
        /// <param name="allocFlags">Flags handed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_cuDoubleReal(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(cuDoubleReal)), height, allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_cuDoubleReal()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Releases the pinned host memory and removes this object from the finalizer queue.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core of the dispose pattern. When called from <see cref="Dispose()"/> the memory is released
        /// with cuMemFreeHost; when called from the finalizer only a debug warning is written.
        /// </summary>
        /// <param name="fDisposing">true when called from Dispose(), false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                // Nothing left to release and nothing to warn about.
                return;
            }

            if (fDisposing)
            {
                // The result is only traced; no exception is raised from here.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(string.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(string.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Host address of the pinned memory block as an IntPtr.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Row length, counted in elements.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows, counted in elements.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Distance between the starts of two consecutive rows, in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the buffer in bytes.
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of a single element in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element. The row is located at byte offset Pitch * y from the start
        /// of the buffer; x then selects the element inside that row. No bounds checking is performed.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element stored at (x, y)</returns>
        public cuDoubleReal this[SizeT x, SizeT y]
        {
            get
            {
                // Row start is computed in bytes because the pitch is a byte count.
                cuDoubleReal* row = (cuDoubleReal*)(((byte*)_ptr) + _pitchInBytes * y);
                return row[x];
            }
            set
            {
                cuDoubleReal* row = (cuDoubleReal*)(((byte*)_ptr) + _pitchInBytes * y);
                row[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy of this host buffer into a 2D CUDA array (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: pitched host rows. Destination: CUDA array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of this host buffer into a 2D CUDA array.
        /// Convenience overload that forwards the array's CUArray handle.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: CUDA array. Destination: pitched host rows.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer.
        /// Convenience overload that forwards the array's CUArray handle.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy of SizeInBytes bytes from this host buffer to device memory (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of SizeInBytes bytes from this host buffer to a device variable (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used as target address</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<cuDoubleReal> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of SizeInBytes bytes from device memory into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous linear copy of SizeInBytes bytes from a device variable into this host buffer (cuMemcpyDtoH_v2).
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used as source address</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToHost(CudaDeviceVariable<cuDoubleReal> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronous 2D copy from this host buffer to pitched device memory (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: pitched host rows. Destination: pitched device memory.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from this host buffer to a pitched device variable.
        /// Convenience overload that forwards the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<cuDoubleReal> deviceVar)
        {
            this.SynchronCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }

        /// <summary>
        /// Synchronous 2D copy from pitched device memory into this host buffer (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: pitched device memory. Destination: pitched host rows.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous 2D copy from a pitched device variable into this host buffer.
        /// Convenience overload that forwards the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<cuDoubleReal> deviceVar)
        {
            this.SynchronCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchronous copy of this host buffer into a 2D CUDA array (cuMemcpy2DAsync_v2).
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: pitched host rows. Destination: CUDA array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of this host buffer into a 2D CUDA array.
        /// Convenience overload that forwards the array's CUArray handle.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyToArray2D(array.CUArray, stream);
        }

        /// <summary>
        /// Asynchronous copy of a 2D CUDA array into this host buffer (cuMemcpy2DAsync_v2).
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: CUDA array. Destination: pitched host rows.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy of a 2D CUDA array into this host buffer.
        /// Convenience overload that forwards the array's CUArray handle.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            this.AsyncCopyFromArray2D(array.CUArray, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from this host buffer to device memory (cuMemcpyHtoDAsync_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from device memory into this host buffer (cuMemcpyDtoHAsync_v2).
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from this host buffer to a device variable (cuMemcpyHtoDAsync_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used as target address</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<cuDoubleReal> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous linear copy of SizeInBytes bytes from a device variable into this host buffer (cuMemcpyDtoHAsync_v2).
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used as source address</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<cuDoubleReal> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, _sizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy from this host buffer to pitched device memory (cuMemcpy2DAsync_v2).
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination, passed on as dstPitch</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: pitched host rows. Destination: pitched device memory.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from this host buffer to a pitched device variable.
        /// Convenience overload that forwards the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<cuDoubleReal> deviceVar, CUstream stream)
        {
            this.AsyncCopyToDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory into this host buffer (cuMemcpy2DAsync_v2).
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source, passed on as srcPitch</param>
        /// <param name="stream">Stream handed to the driver call</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: pitched device memory. Destination: pitched host rows.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable into this host buffer.
        /// Convenience overload that forwards the variable's DevicePointer and Pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream handed to the driver call</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<cuDoubleReal> deviceVar, CUstream stream)
        {
            this.AsyncCopyFromDevice(deviceVar.DevicePointer, deviceVar.Pitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer that corresponds to this pinned host memory (cuMemHostGetDevicePointer_v2).
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer (cuMemHostGetFlags).
        /// </summary>
        /// <returns>The allocation flags reported by the driver for this buffer</returns>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log the API that was actually called; the previous text named cuMemHostGetDevicePointer.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        IEnumerator<cuDoubleReal> IEnumerable<cuDoubleReal>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_cuDoubleReal(this);
        }

        /// <summary>
        /// Creates an untyped enumerator over this buffer.
        /// </summary>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_cuDoubleReal(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_cuDoubleReal.<para/>
    /// Elements are visited row by row: x runs over 0 .. Width - 1 for every y in 0 .. Height - 1.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_cuDoubleReal : IEnumerator<cuDoubleReal>
    {
        private CudaPageLockedHostMemory2D_cuDoubleReal _memory = null;
        private SizeT _currentX = 0;
        private SizeT _currentY = 0;
        // true once MoveNext() has been called since construction or the last Reset()
        private bool _started = false;
        // true while (_currentX, _currentY) addresses an element inside Width x Height
        private bool _positioned = false;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Buffer to enumerate</param>
        /// <exception cref="ArgumentNullException"><paramref name="memory"/> is null</exception>
        public CudaPageLockedHostMemory2DEnumerator_cuDoubleReal(CudaPageLockedHostMemory2D_cuDoubleReal memory)
        {
            if (memory == null)
                throw new ArgumentNullException("memory");
            _memory = memory;
        }

        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = 0;
            _currentY = 0;
            _started = false;
            _positioned = false;
        }

        /// <summary>
        /// Element at the current position.
        /// </summary>
        /// <exception cref="InvalidOperationException">
        /// The enumerator is positioned before the first or after the last element.
        /// </exception>
        public cuDoubleReal Current
        {
            get
            {
                // The indexer of the buffer is an unchecked unsafe access, so never call it
                // with a position that is outside of the buffer.
                if (!_positioned)
                    throw new InvalidOperationException("Enumeration has either not started or has already finished.");
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Element at the current position, boxed.
        /// </summary>
        object IEnumerator.Current
        {
            get { return Current; }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true if the enumerator now addresses an element; false if the end was passed or the buffer has no elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            if (!_started)
            {
                _started = true;
                _currentX = 0;
                _currentY = 0;
                // A buffer with zero width or zero height has no elements at all.
                _positioned = width > 0 && height > 0;
            }
            else if (_positioned)
            {
                _currentX += 1;
                if ((long)_currentX >= width)
                {
                    // End of row reached: continue with the first element of the next row.
                    _currentX = 0;
                    _currentY += 1;
                }
                _positioned = (long)_currentY < height;
            }

            return _positioned;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: cuFloatComplex
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_cuFloatComplex: IDisposable, IEnumerable<cuFloatComplex>
	{
		IntPtr _intPtr;
		cuFloatComplex* _ptr;
        SizeT _sizeInBytes = 0;
        SizeT _width = 0;
        SizeT _pitchInBytes = 0;
        SizeT _height = 0;
        SizeT _typeSize = 0;
        CUResult res;
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_cuFloatComplex and allocates pitchInBytes * height bytes on host using cuMemHostAlloc.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than width * sizeof(cuFloatComplex)</exception>
        /// <exception cref="CudaException">cuMemHostAlloc did not return Success</exception>
        public CudaPageLockedHostMemory2D_cuFloatComplex(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _typeSize = (SizeT)Marshal.SizeOf(typeof(cuFloatComplex));
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _sizeInBytes = _pitchInBytes * _height;

            // One row must be large enough to hold 'width' elements.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(cuFloatComplex)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            // Typed view on the same address, used by the indexer.
            _ptr = (cuFloatComplex*)_intPtr;
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_cuFloatComplex and allocates the memory on host. Using cuMemHostAlloc without flags
        /// (0 is passed as allocFlags to the four-argument constructor).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_cuFloatComplex(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {
            // Intentionally empty: all work is done by the four-argument overload.

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_cuFloatComplex and allocates the memory on host. Using cuMemHostAlloc without flags
        /// (0 is passed as allocFlags to the four-argument constructor).<para/>
        /// Pitch is assumed to be width * sizeof(cuFloatComplex).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_cuFloatComplex(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(cuFloatComplex)), height, 0)
        {
            // Intentionally empty: all work is done by the four-argument overload.

        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_cuFloatComplex and allocates the memory on host. Using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(cuFloatComplex).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags">Allocation flags, forwarded unchanged to the four-argument constructor</param>
        public CudaPageLockedHostMemory2D_cuFloatComplex(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(cuFloatComplex)), height, allocFlags)
        {
            // Intentionally empty: all work is done by the four-argument overload.

        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of Dispose(bool).
        /// </summary>
        ~CudaPageLockedHostMemory2D_cuFloatComplex()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Runs the disposing path of Dispose(bool) and then suppresses finalization of this instance.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Core of the dispose pattern. Does nothing if the instance is already marked as disposed.
        /// </summary>
        /// <param name="fDisposing">
        /// true: called from Dispose(); the host memory is released with cuMemFreeHost and the instance is marked disposed.<para/>
        /// false: called from the finalizer; only a debug warning is written and nothing is released.
        /// </param>
        protected virtual void Dispose(bool fDisposing)
        {
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // The result code is stored and logged only; no exception is raised on failure.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Host address of the pinned memory block as returned by cuMemHostAlloc.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return this._intPtr;
            }
        }

        /// <summary>
        /// Row length, counted in elements.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Number of rows, counted in elements.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Distance between the starts of two consecutive rows, in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch in bytes multiplied by height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Size of a single cuFloatComplex element in bytes, as reported by Marshal.SizeOf.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return this._typeSize;
            }
        }

        /// <summary>
        /// Reads or writes a single element. The row is located at byte offset Pitch * y from the start
        /// of the buffer; x then selects the element inside that row. No bounds checking is performed.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element stored at (x, y)</returns>
        public cuFloatComplex this[SizeT x, SizeT y]
        {
            get
            {
                // Row start is computed in bytes because the pitch is a byte count.
                cuFloatComplex* row = (cuFloatComplex*)(((byte*)_ptr) + _pitchInBytes * y);
                return row[x];
            }
            set
            {
                cuFloatComplex* row = (cuFloatComplex*)(((byte*)_ptr) + _pitchInBytes * y);
                row[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronous copy of this host buffer into a 2D CUDA array (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="deviceArray">Destination array handle</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: pitched host rows. Destination: CUDA array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of this host buffer into a 2D CUDA array.
        /// Convenience overload that forwards the array's CUArray handle.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            this.SynchronCopyToArray2D(array.CUArray);
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer (cuMemcpy2D_v2).
        /// Width * TypeSize bytes are copied per row, for Height rows.
        /// </summary>
        /// <param name="deviceArray">Source array handle</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: CUDA array. Destination: pitched host rows.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronous copy of a 2D CUDA array into this host buffer.
        /// Convenience overload that forwards the array's CUArray handle.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            this.SynchronCopyFromArray2D(array.CUArray);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronous linear copy of SizeInBytes bytes from this host buffer to device memory (cuMemcpyHtoD_v2).
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">The instance has been disposed</exception>
        /// <exception cref="CudaException">The driver call did not return Success</exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, _sizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies the whole host buffer (SizeInBytes bytes) to a device variable.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used as target address</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoD_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<cuFloatComplex> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from linear device memory into this host buffer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoH_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from a device variable into this host buffer.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used as source address</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoH_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<cuFloatComplex> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this host buffer to pitched device memory.<para/>
        /// Each row transfers width * element size bytes; source rows are spaced by this buffer's pitch, destination rows by <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer to a pitched device variable.<para/>
        /// Convenience overload that forwards the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<cuFloatComplex> deviceVar)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Synchronously copies pitched device memory into this host buffer.<para/>
        /// Each row transfers width * element size bytes; source rows are spaced by <paramref name="pitchDevice"/>, destination rows by this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a pitched device variable into this host buffer.<para/>
        /// Convenience overload that forwards the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<cuFloatComplex> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Issues an asynchronous copy of this host buffer to a 2D CUDA array on the given stream.<para/>
        /// Each row transfers width * element size bytes; source rows are spaced by this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of this host buffer to a 2D CUDA array on the given stream.<para/>
        /// Convenience overload that forwards the array's handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            AsyncCopyToArray2D(handle, stream);
        }

        /// <summary>
        /// Issues an asynchronous copy of a 2D CUDA array into this host buffer on the given stream.<para/>
        /// Each row transfers width * element size bytes; destination rows are spaced by this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of a 2D CUDA array into this host buffer on the given stream.<para/>
        /// Convenience overload that forwards the array's handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            CUarray handle = array.CUArray;
            AsyncCopyFromArray2D(handle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Issues an asynchronous copy of the whole host buffer (SizeInBytes bytes) to linear device memory.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="stream">Stream passed to cuMemcpyHtoDAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoDAsync_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of SizeInBytes bytes from linear device memory into this host buffer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="stream">Stream passed to cuMemcpyDtoHAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoHAsync_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of the whole host buffer (SizeInBytes bytes) to a device variable.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used as target address</param>
        /// <param name="stream">Stream passed to cuMemcpyHtoDAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoDAsync_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CudaDeviceVariable<cuFloatComplex> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of SizeInBytes bytes from a device variable into this host buffer.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used as source address</param>
        /// <param name="stream">Stream passed to cuMemcpyDtoHAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoHAsync_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<cuFloatComplex> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Issues an asynchronous copy of this host buffer to pitched device memory on the given stream.<para/>
        /// Each row transfers width * element size bytes; source rows are spaced by this buffer's pitch, destination rows by <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of this host buffer to a pitched device variable on the given stream.<para/>
        /// Convenience overload that forwards the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<cuFloatComplex> deviceVar, CUstream stream)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(target, targetPitch, stream);
        }

        /// <summary>
        /// Issues an asynchronous copy of pitched device memory into this host buffer on the given stream.<para/>
        /// Each row transfers width * element size bytes; source rows are spaced by <paramref name="pitchDevice"/>, destination rows by this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory</param>
        /// <param name="stream">Stream passed to cuMemcpy2DAsync_v2</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2DAsync_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Issues an asynchronous copy of a pitched device variable into this host buffer on the given stream.<para/>
        /// Convenience overload that forwards the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream the copy is issued on</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<cuFloatComplex> deviceVar, CUstream stream)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer under which this pinned host memory is mapped into device address space.<para/>
        /// Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device pointer reported by cuMemHostGetDevicePointer_v2</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemHostGetDevicePointer_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUdeviceptr mapped = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mapped, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
            return mapped;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns>Flags reported by cuMemHostGetFlags for this allocation</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemHostGetFlags returned something other than <see cref="CUResult.Success"/></exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Log the API that was actually called; the trace used to name cuMemHostGetDevicePointer here.
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        // Typed enumerator over this buffer's elements; refuses to enumerate a disposed buffer.
        IEnumerator<cuFloatComplex> IEnumerable<cuFloatComplex>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_cuFloatComplex(this);
        }

        // Non-generic enumerator over this buffer's elements; refuses to enumerate a disposed buffer.
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_cuFloatComplex(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_cuFloatComplex.<para/>
    /// Walks the elements row by row: X runs from 0 to Width - 1 for each Y from 0 to Height - 1.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_cuFloatComplex : IEnumerator<cuFloatComplex>
    {
        private CudaPageLockedHostMemory2D_cuFloatComplex _memory = null;
        // Start one column before the first element so the first MoveNext() lands on (0, 0).
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Host memory object to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_cuFloatComplex(CudaPageLockedHostMemory2D_cuFloatComplex memory)
        {
            _memory = memory;
        }

        // The enumerator holds no resources of its own.
        void IDisposable.Dispose() { }

        /// <summary>
        /// Moves the enumerator back to its initial position, before the first element.
        /// </summary>
        public void Reset()
        {
            _currentX = -1;
            _currentY = 0;
        }

        /// <summary>
        /// Element at the current position. Only meaningful after <see cref="MoveNext"/> has returned true.
        /// </summary>
        public cuFloatComplex Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Element at the current position, boxed. Only meaningful after <see cref="MoveNext"/> has returned true.
        /// </summary>
        object IEnumerator.Current
        {
            get { return _memory[_currentX, _currentY]; }
        }

        /// <summary>
        /// Advances to the next element, wrapping to the start of the next row at the end of a row.
        /// </summary>
        /// <returns>true if the enumerator now points at an element; false if there are no more elements</returns>
        public bool MoveNext()
        {
            long width = (long)_memory.Width;
            long height = (long)_memory.Height;

            // A buffer with zero-width rows has no elements. Without this guard the
            // wrap below would reset X to 0 and report rows 1..Height-1 as valid.
            if (width <= 0)
                return false;

            _currentX += 1;
            if ((long)_currentX >= width)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: cuFloatReal
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_cuFloatReal: IDisposable, IEnumerable<cuFloatReal>
	{
		// Address of the pinned host allocation, filled in by cuMemHostAlloc in the constructor.
		IntPtr _intPtr;
		// Same address as _intPtr, typed for element access through the indexer.
		cuFloatReal* _ptr;
        // Total allocation size in bytes: _pitchInBytes * _height.
        SizeT _sizeInBytes = 0;
        // Row width in elements.
        SizeT _width = 0;
        // Distance between the starts of two consecutive rows, in bytes.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(cuFloatReal)), cached in the constructor.
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made by this object.
        CUResult res;
        // Set once Dispose has freed the host memory.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Allocates pitchInBytes * height bytes of page locked host memory with cuMemHostAlloc
        /// and wraps them as a 2D buffer of cuFloatReal elements.
        /// </summary>
        /// <param name="width">Row width in elements</param>
        /// <param name="pitchInBytes">Row stride in bytes; must be at least width * sizeof(cuFloatReal)</param>
        /// <param name="height">Number of rows</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        /// <exception cref="ArgumentException">pitchInBytes is smaller than one row of elements</exception>
        /// <exception cref="CudaException">cuMemHostAlloc returned something other than <see cref="CUResult.Success"/></exception>
        public CudaPageLockedHostMemory2D_cuFloatReal(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = new IntPtr();
            _typeSize = (SizeT)Marshal.SizeOf(typeof(cuFloatReal));
            _width = width;
            _height = height;
            _pitchInBytes = pitchInBytes;
            _sizeInBytes = _pitchInBytes * _height;

            // A row of elements has to fit into one pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(cuFloatReal)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));
            _ptr = (cuFloatReal*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Allocates a 2D page locked host buffer with an explicit pitch, passing no flags (0) to cuMemHostAlloc.
        /// </summary>
        /// <param name="width">Row width in elements</param>
        /// <param name="pitchInBytes">Row stride in bytes</param>
        /// <param name="height">Number of rows</param>
        public CudaPageLockedHostMemory2D_cuFloatReal(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Allocates a tightly packed 2D page locked host buffer, passing no flags (0) to cuMemHostAlloc.<para/>
        /// The pitch is set to width * sizeof(cuFloatReal).
        /// </summary>
        /// <param name="width">Row width in elements</param>
        /// <param name="height">Number of rows</param>
        public CudaPageLockedHostMemory2D_cuFloatReal(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(cuFloatReal)), height, (CUMemHostAllocFlags)0)
        {
        }

        /// <summary>
        /// Allocates a tightly packed 2D page locked host buffer using cuMemHostAlloc with the given flags.<para/>
        /// The pitch is set to width * sizeof(cuFloatReal).
        /// </summary>
        /// <param name="width">Row width in elements</param>
        /// <param name="height">Number of rows</param>
        /// <param name="allocFlags">Flags passed to cuMemHostAlloc</param>
        public CudaPageLockedHostMemory2D_cuFloatReal(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(
                width,
                width * (SizeT)Marshal.SizeOf(typeof(cuFloatReal)),
                height,
                allocFlags)
        {
        }

        /// <summary>
        /// Finalizer. Runs the non-disposing path of <see cref="Dispose(bool)"/>.
        /// </summary>
        ~CudaPageLockedHostMemory2D_cuFloatReal()
        {
            this.Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Frees the pinned host memory and suppresses finalization of this object.
        /// </summary>
        public void Dispose()
        {
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// Disposal logic shared by <see cref="Dispose()"/> and the finalizer.<para/>
        /// When disposing, frees the host memory with cuMemFreeHost and marks the object disposed; the result is only logged.<para/>
        /// When called from the finalizer on a not yet disposed object, only a debug warning is written.
        /// </summary>
        /// <param name="fDisposing">true when called from <see cref="Dispose()"/>, false when called from the finalizer</param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Nothing left to do, on either path, once the memory has been freed.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Address of the pinned host memory backing this object.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                return _intPtr;
            }
        }

        /// <summary>
        /// Row width, counted in elements.
        /// </summary>
        public SizeT Width
        {
            get
            {
                return _width;
            }
        }

        /// <summary>
        /// Number of rows.
        /// </summary>
        public SizeT Height
        {
            get
            {
                return _height;
            }
        }

        /// <summary>
        /// Row stride of the host buffer, in bytes.
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                return _pitchInBytes;
            }
        }

        /// <summary>
        /// Total size of the allocation in bytes (pitch times height).
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                return _sizeInBytes;
            }
        }

        /// <summary>
        /// Size of a single cuFloatReal element in bytes.
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                return _typeSize;
            }
        }

        /// <summary>
        /// Access array per element.<para/>
        /// The element address is computed as base pointer + pitch * y, then indexed by x. Indices are not range checked.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns>The element at column x of row y</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        public cuFloatReal this[SizeT x, SizeT y]
        {
            get
            {
                // The pinned memory is freed on Dispose; refuse to dereference it afterwards,
                // in line with every other member of this class.
                if (disposed) throw new ObjectDisposedException(this.ToString());
                cuFloatReal* line = (cuFloatReal*)(((byte*)_ptr) + _pitchInBytes * y);
                return line[x];
            }
            set
            {
                if (disposed) throw new ObjectDisposedException(this.ToString());
                cuFloatReal* line = (cuFloatReal*)(((byte*)_ptr) + _pitchInBytes * y);
                line[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchronously copies this pinned host buffer into a 2D CUDA array.<para/>
        /// Each row transfers Width * TypeSize bytes; source rows are spaced by this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Destination CUDA array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Array,
                dstArray = deviceArray,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this pinned host buffer into a 2D CUDA array.<para/>
        /// Convenience overload that forwards the array's handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Destination array wrapper</param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            CUarray handle = array.CUArray;
            SynchronCopyToArray2D(handle);
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this pinned host buffer.<para/>
        /// Each row transfers Width * TypeSize bytes; destination rows are spaced by this buffer's pitch.
        /// </summary>
        /// <param name="deviceArray">Source CUDA array handle</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Array,
                srcArray = deviceArray,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a 2D CUDA array into this pinned host buffer.<para/>
        /// Convenience overload that forwards the array's handle to the CUarray overload.
        /// </summary>
        /// <param name="array">Source array wrapper</param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            CUarray handle = array.CUArray;
            SynchronCopyFromArray2D(handle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchronously copies the whole host buffer (SizeInBytes bytes) to linear device memory.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoD_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies the whole host buffer (SizeInBytes bytes) to a device variable.
        /// </summary>
        /// <param name="devicePtr">Destination device variable; its DevicePointer is used as target address</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyHtoD_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CudaDeviceVariable<cuFloatReal> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(devicePtr.DevicePointer, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from linear device memory into this host buffer.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoH_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies SizeInBytes bytes from a device variable into this host buffer.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is used as source address</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpyDtoH_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToHost(CudaDeviceVariable<cuFloatReal> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchronously copies this host buffer to pitched device memory.<para/>
        /// Each row transfers Width * TypeSize bytes; source rows are spaced by this buffer's pitch, destination rows by <paramref name="pitchDevice"/>.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Host,
                srcHost = _intPtr,
                srcPitch = _pitchInBytes,
                dstMemoryType = CUMemoryType.Device,
                dstDevice = devicePtr,
                dstPitch = pitchDevice,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies this host buffer to a pitched device variable.<para/>
        /// Convenience overload that forwards the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<cuFloatReal> deviceVar)
        {
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Synchronously copies pitched device memory into this host buffer.<para/>
        /// Each row transfers Width * TypeSize bytes; source rows are spaced by <paramref name="pitchDevice"/>, destination rows by this buffer's pitch.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed</exception>
        /// <exception cref="CudaException">cuMemcpy2D_v2 returned something other than <see cref="CUResult.Success"/></exception>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcMemoryType = CUMemoryType.Device,
                srcDevice = devicePtr,
                srcPitch = pitchDevice,
                dstMemoryType = CUMemoryType.Host,
                dstHost = _intPtr,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchronously copies a pitched device variable into this host buffer.<para/>
        /// Convenience overload that forwards the variable's device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<cuFloatReal> deviceVar)
        {
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: this pinned host buffer. Destination: the CUDA array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            // Forward the wrapped array handle to the CUarray overload.
            CUarray handle = array.CUArray;
            AsyncCopyToArray2D(handle, stream);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: the CUDA array. Destination: this pinned host buffer.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            // Forward the wrapped array handle to the CUarray overload.
            CUarray handle = array.CUArray;
            AsyncCopyFromArray2D(handle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // One linear host-to-device transfer of SizeInBytes bytes, queued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // One linear device-to-host transfer of SizeInBytes bytes, queued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaDeviceVariable<cuFloatReal> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // The device variable is only used for its raw pointer; SizeInBytes of this buffer is copied.
            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                destination, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaDeviceVariable<cuFloatReal> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // The device variable is only used for its raw pointer; SizeInBytes of this buffer is filled.
            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr, source, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: this pinned host buffer. Destination: pitched device memory.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<cuFloatReal> deviceVar, CUstream stream)
        {
            // Unpack pointer and pitch, then reuse the raw-pointer overload.
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            AsyncCopyToDevice(target, targetPitch, stream);
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: pitched device memory. Destination: this pinned host buffer.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<cuFloatReal> deviceVar, CUstream stream)
        {
            // Unpack pointer and pitch, then reuse the raw-pointer overload.
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Returns the CUdeviceptr for pinned host memory mapped to device memory space. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // The driver writes the mapped device address into this local; the last argument (flags) is 0.
            CUdeviceptr mapped = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mapped, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mapped;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer
        /// </summary>
        /// <returns></returns>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());

            // Ask the driver which flags the pinned allocation behind _intPtr was created with.
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);

            // Trace label names the API actually invoked (it previously read
            // "cuMemHostGetDevicePointer", a copy-paste leftover that mislabeled the debug log).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        IEnumerator<cuFloatReal> IEnumerable<cuFloatReal>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Each call hands out a fresh enumerator over this buffer.
            return new CudaPageLockedHostMemory2DEnumerator_cuFloatReal(this);
        }

        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Non-generic view of the same enumerator type used by the generic overload.
            return new CudaPageLockedHostMemory2DEnumerator_cuFloatReal(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_cuFloatReal
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_cuFloatReal : IEnumerator<cuFloatReal>
    {
        // Buffer being enumerated; assigned once in the constructor.
        private CudaPageLockedHostMemory2D_cuFloatReal _memory = null;

        // Current column / row. The column starts at -1 so that the first MoveNext() steps onto column 0.
        // NOTE(review): this relies on how SizeT converts -1 and on its (long) cast; confirm against SizeT.
        private SizeT _currentX = -1;
        private SizeT _currentY = 0;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Host memory object whose elements are enumerated.</param>
        public CudaPageLockedHostMemory2DEnumerator_cuFloatReal(CudaPageLockedHostMemory2D_cuFloatReal memory)
        {
            _memory = memory;
        }

        // The enumerator holds no resources of its own, so there is nothing to release.
        void IDisposable.Dispose()
        {
        }

        /// <summary>
        /// Puts the enumerator back to its initial position (column -1, row 0).
        /// </summary>
        public void Reset()
        {
            _currentY = 0;
            _currentX = -1;
        }

        /// <summary>
        /// Element of the underlying memory at the current column and row.
        /// </summary>
        public cuFloatReal Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Same element as <see cref="Current"/>, returned as <see cref="object"/>.
        /// </summary>
        object IEnumerator.Current
        {
            get
            {
                return _memory[_currentX, _currentY];
            }
        }

        /// <summary>
        /// Steps one column forward; when the column index reaches the memory's Width it wraps to
        /// column 0 of the next row.
        /// </summary>
        /// <returns>false once the row index has reached the memory's Height, true otherwise.</returns>
        public bool MoveNext()
        {
            _currentX += 1;

            bool rowExhausted = (long)_currentX >= (long)_memory.Width;
            if (rowExhausted)
            {
                _currentX = 0;
                _currentY += 1;
            }

            return (long)_currentY < (long)_memory.Height;
        }
    }

	
	/// <summary>
	/// A variable located in page locked (pinned) host memory. Use this type of variable for asynchronous memcpy.<para/>
	/// Type: dim3
	/// </summary>
	public unsafe class CudaPageLockedHostMemory2D_dim3: IDisposable, IEnumerable<dim3>
	{
		// Address of the pinned host allocation; filled in by cuMemHostAlloc in the constructor.
		IntPtr _intPtr;
		// Same address as _intPtr, typed as dim3* for element access in the indexer.
		dim3* _ptr;
        // Total allocation size in bytes (pitch in bytes times height).
        SizeT _sizeInBytes = 0;
        // Row length in elements.
        SizeT _width = 0;
        // Row stride in bytes; the constructor requires it to be at least width * element size.
        SizeT _pitchInBytes = 0;
        // Number of rows.
        SizeT _height = 0;
        // Marshal.SizeOf(typeof(dim3)), cached at construction.
        SizeT _typeSize = 0;
        // Result of the most recent driver API call made through this object.
        CUResult res;
        // Set to true once Dispose(true) has run.
        bool disposed;

        #region Constructor
        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_dim3 and allocates the memory on host. Using cuMemHostAlloc
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_dim3(SizeT width, SizeT pitchInBytes, SizeT height, CUMemHostAllocFlags allocFlags)
        {
            _intPtr = IntPtr.Zero;

            // Record the geometry; the total size is one pitch per row.
            _width = width;
            _pitchInBytes = pitchInBytes;
            _height = height;
            _typeSize = (SizeT)Marshal.SizeOf(typeof(dim3));
            _sizeInBytes = _pitchInBytes * _height;

            // A row of `width` elements has to fit inside one pitch.
            if (_typeSize * width > _pitchInBytes)
            {
                throw new ArgumentException("pitchInBytes must be greater or equal to width * sizeof(dim3)", "pitchInBytes");
            }

            res = DriverAPINativeMethods.MemoryManagement.cuMemHostAlloc(ref _intPtr, _sizeInBytes, allocFlags);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostAlloc", res));

            // The typed pointer is set before the result check, exactly as the raw pointer is.
            _ptr = (dim3*)_intPtr;

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_dim3 and allocates the memory on host. Using cuMemHostAlloc without flags.
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="pitchInBytes">Width including alignment in bytes</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_dim3(SizeT width, SizeT pitchInBytes, SizeT height)
            : this(width, pitchInBytes, height, 0)
        {
            // All work happens in the four-argument constructor; 0 means no allocation flags.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_dim3 and allocates the memory on host. Using cuMemHostAlloc without flags.<para/>
        /// Pitch is assumed to be width * sizeof(dim3).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        public CudaPageLockedHostMemory2D_dim3(SizeT width, SizeT height)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(dim3)), height, 0)
        {
            // Tightly packed rows (pitch = width * element size) and no allocation flags;
            // all work happens in the four-argument constructor.
        }

        /// <summary>
        /// Creates a new CudaPageLockedHostMemory2D_dim3 and allocates the memory on host using cuMemHostAlloc with the given flags.<para/>
        /// Pitch is assumed to be width * sizeof(dim3).
        /// </summary>
        /// <param name="width">In elements</param>
        /// <param name="height">In elements</param>
        /// <param name="allocFlags"></param>
        public CudaPageLockedHostMemory2D_dim3(SizeT width, SizeT height, CUMemHostAllocFlags allocFlags)
            : this(width, width * (SizeT)Marshal.SizeOf(typeof(dim3)), height, allocFlags)
        {
            // Tightly packed rows (pitch = width * element size) with caller-supplied flags;
            // all work happens in the four-argument constructor.
        }

        /// <summary>
        /// For dispose
        /// </summary>
        ~CudaPageLockedHostMemory2D_dim3()
        {
            // Finalizer path: Dispose(false) does not free the memory, it only writes a debug warning
            // when the object was never disposed.
            Dispose(false);
        }
        #endregion

        #region Dispose
        /// <summary>
        /// Dispose
        /// </summary>
        public void Dispose()
        {
            // Release the pinned allocation, then tell the GC that the finalizer no longer needs to run.
            this.Dispose(true);
            GC.SuppressFinalize(this);
        }

        /// <summary>
        /// For IDisposable
        /// </summary>
        /// <param name="fDisposing"></param>
        protected virtual void Dispose(bool fDisposing)
        {
            // Already released: neither path below has anything left to do.
            if (disposed)
            {
                return;
            }

            if (fDisposing)
            {
                // Explicit Dispose(): free the pinned memory. The driver result is logged but not thrown.
                res = DriverAPINativeMethods.MemoryManagement.cuMemFreeHost(_intPtr);
                Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemFreeHost", res));
                disposed = true;
            }
            else
            {
                // Finalizer path: nothing is freed here; only a debug warning is written.
                Debug.WriteLine(String.Format("ManagedCUDA not-disposed warning: {0}", this.GetType()));
            }
        }
        #endregion

        #region Properties
        /// <summary>
        /// Pointer to pinned host memory.
        /// </summary>
        public IntPtr PinnedHostPointer
        {
            get
            {
                // Raw address obtained from cuMemHostAlloc in the constructor.
                return this._intPtr;
            }
        }

        /// <summary>
        /// Width in elements
        /// </summary>
        public SizeT Width
        {
            get
            {
                return this._width;
            }
        }

        /// <summary>
        /// Height in elements
        /// </summary>
        public SizeT Height
        {
            get
            {
                return this._height;
            }
        }

        /// <summary>
        /// Pitch in bytes
        /// </summary>
        public SizeT Pitch
        {
            get
            {
                // Row stride in bytes as passed to (or computed by) the constructor.
                return this._pitchInBytes;
            }
        }

        /// <summary>
        /// Size in bytes
        /// </summary>
        public SizeT SizeInBytes
        {
            get
            {
                // Pitch in bytes times height, computed once in the constructor.
                return this._sizeInBytes;
            }
        }

        /// <summary>
        /// Type size in bytes
        /// </summary>
        public SizeT TypeSize
        {
            get
            {
                // Marshal.SizeOf(typeof(dim3)), cached in the constructor.
                return this._typeSize;
            }
        }

        /// <summary>
        /// Access array per element.
        /// </summary>
        /// <param name="x">X-index in elements</param>
        /// <param name="y">Y-index in elements</param>
        /// <returns></returns>
        public dim3 this[SizeT x, SizeT y]
        {
            // Rows are addressed in bytes (pitch * y) and columns in elements (x).
            // No bounds or disposed check is performed here.
            get
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                return ((dim3*)rowStart)[x];
            }
            set
            {
                byte* rowStart = ((byte*)_ptr) + _pitchInBytes * y;
                ((dim3*)rowStart)[x] = value;
            }
        }
        #endregion

        #region Synchron Copy Methods
        #region Array2D
        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyToArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: this pinned host buffer. Destination: the CUDA array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyToArray2D(CudaArray2D array)
        {
            // Forward the wrapped array handle to the CUarray overload.
            CUarray handle = array.CUArray;
            SynchronCopyToArray2D(handle);
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        public void SynchronCopyFromArray2D(CUarray deviceArray)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: the CUDA array. Destination: this pinned host buffer.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        public void SynchronCopyFromArray2D(CudaArray2D array)
        {
            // Forward the wrapped array handle to the CUarray overload.
            CUarray handle = array.CUArray;
            SynchronCopyFromArray2D(handle);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // One linear host-to-device transfer of the whole allocation (pitch * height bytes).
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(
                devicePtr, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToDevice(CudaDeviceVariable<dim3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // The device variable is only used for its raw pointer; the whole host allocation is copied.
            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyHtoD_v2(
                destination, _intPtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoD", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CUdeviceptr devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // One linear device-to-host transfer that fills the whole allocation (pitch * height bytes).
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(
                _intPtr, devicePtr, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        public void SynchronCopyToHost(CudaDeviceVariable<dim3> devicePtr)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // The device variable is only used for its raw pointer; the whole host allocation is filled.
            CUdeviceptr source = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpyDtoH_v2(
                _intPtr, source, SizeInBytes);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoH", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: this pinned host buffer. Destination: pitched device memory.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron Copy host to pitched device
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyToDevice(CudaPitchedDeviceVariable<dim3> deviceVar)
        {
            // Unpack pointer and pitch, then reuse the raw-pointer overload.
            CUdeviceptr target = deviceVar.DevicePointer;
            SizeT targetPitch = deviceVar.Pitch;
            SynchronCopyToDevice(target, targetPitch);
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="pitchDevice"></param>
        public void SynchronCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: pitched device memory. Destination: this pinned host buffer.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.SynchronousMemcpy_v2.cuMemcpy2D_v2(ref copyParams);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2D_v2", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Synchron copy device to host
        /// </summary>
        /// <param name="deviceVar"></param>
        public void SynchronCopyFromDevice(CudaPitchedDeviceVariable<dim3> deviceVar)
        {
            // Unpack pointer and pitch, then reuse the raw-pointer overload.
            CUdeviceptr source = deviceVar.DevicePointer;
            SizeT sourcePitch = deviceVar.Pitch;
            SynchronCopyFromDevice(source, sourcePitch);
        }
        #endregion
        #endregion

        #region Asynchron Copy Methods
        #region Array2D
        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: this pinned host buffer. Destination: the CUDA array.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstArray = deviceArray,
                dstMemoryType = CUMemoryType.Array,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy host to 2D Array
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToArray2D(CudaArray2D array, CUstream stream)
        {
            // Forward the wrapped array handle to the CUarray overload.
            CUarray handle = array.CUArray;
            AsyncCopyToArray2D(handle, stream);
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="deviceArray"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CUarray deviceArray, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // Source: the CUDA array. Destination: this pinned host buffer.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcArray = deviceArray,
                srcMemoryType = CUMemoryType.Array,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                // Bytes copied per row: element count times element size.
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));
            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy 2D Array to host
        /// </summary>
        /// <param name="array"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromArray2D(CudaArray2D array, CUstream stream)
        {
            // Forward the wrapped array handle to the CUarray overload.
            CUarray handle = array.CUArray;
            AsyncCopyFromArray2D(handle, stream);
        }
        #endregion
        #region DevicePtr
        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // One linear host-to-device transfer of the whole allocation, queued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                devicePtr, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron copy device to host
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // One linear device-to-host transfer filling the whole allocation, queued on the given stream.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(
                _intPtr, devicePtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchron Copy host to device
        /// </summary>
        /// <param name="devicePtr"></param>
        /// <param name="stream"></param>
        public void AsyncCopyToDevice(CudaDeviceVariable<dim3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(this.ToString());
            }

            // The device variable is only used for its raw pointer; the whole host allocation is copied.
            CUdeviceptr destination = devicePtr.DevicePointer;
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyHtoDAsync_v2(
                destination, _intPtr, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyHtoDAsync", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }
        }

        /// <summary>
        /// Asynchronous copy from a device variable to this host buffer. SizeInBytes bytes are
        /// copied as one linear range using cuMemcpyDtoHAsync.
        /// </summary>
        /// <param name="devicePtr">Source device variable; its DevicePointer is the copy source</param>
        /// <param name="stream">Stream passed to the asynchronous copy</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CudaDeviceVariable<dim3> devicePtr, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Device -> host: the number of bytes copied is taken from this host buffer, not from the device variable.
            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpyDtoHAsync_v2(_intPtr, devicePtr.DevicePointer, SizeInBytes, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpyDtoHAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }
        #endregion
        #region PitchedDevicePtr
        /// <summary>
        /// Asynchronous 2D copy from this host buffer to pitched device memory using cuMemcpy2DAsync.
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes.
        /// </summary>
        /// <param name="devicePtr">Destination device pointer</param>
        /// <param name="pitchDevice">Pitch of the destination device memory</param>
        /// <param name="stream">Stream passed to the asynchronous copy</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyToDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Destination is the device side with the caller's pitch; source is this host buffer with its own pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                dstDevice = devicePtr,
                dstMemoryType = CUMemoryType.Device,
                dstPitch = pitchDevice,
                srcHost = _intPtr,
                srcMemoryType = CUMemoryType.Host,
                srcPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous 2D copy from this host buffer to a pitched device variable. Forwards the
        /// variable's DevicePointer and Pitch to the overload taking a raw device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Destination pitched device variable</param>
        /// <param name="stream">Stream passed to the asynchronous copy</param>
        public void AsyncCopyToDevice(CudaPitchedDeviceVariable<dim3> deviceVar, CUstream stream)
        {
            var destination = deviceVar.DevicePointer;
            var destinationPitch = deviceVar.Pitch;

            AsyncCopyToDevice(destination, destinationPitch, stream);
        }

        /// <summary>
        /// Asynchronous 2D copy from pitched device memory to this host buffer using cuMemcpy2DAsync.
        /// Each of the <c>_height</c> rows transfers <c>_width * _typeSize</c> bytes.
        /// </summary>
        /// <param name="devicePtr">Source device pointer</param>
        /// <param name="pitchDevice">Pitch of the source device memory</param>
        /// <param name="stream">Stream passed to the asynchronous copy</param>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public void AsyncCopyFromDevice(CUdeviceptr devicePtr, SizeT pitchDevice, CUstream stream)
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Source is the device side with the caller's pitch; destination is this host buffer with its own pitch.
            CUDAMemCpy2D copyParams = new CUDAMemCpy2D
            {
                srcDevice = devicePtr,
                srcMemoryType = CUMemoryType.Device,
                srcPitch = pitchDevice,
                dstHost = _intPtr,
                dstMemoryType = CUMemoryType.Host,
                dstPitch = _pitchInBytes,
                WidthInBytes = _width * _typeSize,
                Height = _height
            };

            res = DriverAPINativeMethods.AsynchronousMemcpy_v2.cuMemcpy2DAsync_v2(ref copyParams, stream);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemcpy2DAsync", res));

            if (res == CUResult.Success)
            {
                return;
            }

            throw new CudaException(res);
        }

        /// <summary>
        /// Asynchronous 2D copy from a pitched device variable to this host buffer. Forwards the
        /// variable's DevicePointer and Pitch to the overload taking a raw device pointer and pitch.
        /// </summary>
        /// <param name="deviceVar">Source pitched device variable</param>
        /// <param name="stream">Stream passed to the asynchronous copy</param>
        public void AsyncCopyFromDevice(CudaPitchedDeviceVariable<dim3> deviceVar, CUstream stream)
        {
            var source = deviceVar.DevicePointer;
            var sourcePitch = deviceVar.Pitch;

            AsyncCopyFromDevice(source, sourcePitch, stream);
        }
        #endregion
        #endregion

        #region Methods
        /// <summary>
        /// Queries the device pointer that corresponds to this pinned host allocation via
        /// cuMemHostGetDevicePointer. Only valid if context is created with flag <see cref="CUCtxFlags.MapHost"/>
        /// </summary>
        /// <returns>Device Pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public CUdeviceptr GetDevicePointer()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            // Filled in by the driver call below; the last argument (flags) is passed as 0.
            CUdeviceptr mappedPointer = new CUdeviceptr();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetDevicePointer_v2(ref mappedPointer, _intPtr, 0);
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetDevicePointer", res));

            if (res != CUResult.Success)
            {
                throw new CudaException(res);
            }

            return mappedPointer;
        }

        /// <summary>
        /// Passes back the flags that were specified when allocating the pinned host buffer,
        /// as reported by cuMemHostGetFlags.
        /// </summary>
        /// <returns>The allocation flags reported by the driver for this host pointer</returns>
        /// <exception cref="ObjectDisposedException">This object has already been disposed.</exception>
        /// <exception cref="CudaException">The driver call did not return <see cref="CUResult.Success"/>.</exception>
        public CUMemHostAllocFlags GetAllocFlags()
        {
            if (disposed) throw new ObjectDisposedException(this.ToString());
            CUMemHostAllocFlags flags = new CUMemHostAllocFlags();
            res = DriverAPINativeMethods.MemoryManagement.cuMemHostGetFlags(ref flags, _intPtr);
            // Trace the API that was actually called (the previous text named cuMemHostGetDevicePointer by mistake).
            Debug.WriteLine(String.Format("{0:G}, {1}: {2}", DateTime.Now, "cuMemHostGetFlags", res));
            if (res != CUResult.Success) throw new CudaException(res);
            return flags;
        }
        #endregion

        #region IEnumerable
        /// <summary>
        /// Creates a typed enumerator over this host memory.
        /// </summary>
        /// <returns>A new <see cref="CudaPageLockedHostMemory2DEnumerator_dim3"/> bound to this instance</returns>
        IEnumerator<dim3> IEnumerable<dim3>.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_dim3(this);
        }

        /// <summary>
        /// Creates a non-generic enumerator over this host memory.
        /// </summary>
        /// <returns>A new <see cref="CudaPageLockedHostMemory2DEnumerator_dim3"/> bound to this instance</returns>
        IEnumerator IEnumerable.GetEnumerator()
        {
            if (disposed)
            {
                throw new ObjectDisposedException(ToString());
            }

            return new CudaPageLockedHostMemory2DEnumerator_dim3(this);
        }

        #endregion
    }

    /// <summary>
    /// Enumerator class for CudaPageLockedHostMemory2D_dim3. Elements are visited row by row:
    /// the X position advances first and wraps to 0 when it reaches Width, at which point Y is incremented.
    /// </summary>
    public class CudaPageLockedHostMemory2DEnumerator_dim3 : IEnumerator<dim3>
    {
        // Memory object whose elements are enumerated.
        private CudaPageLockedHostMemory2D_dim3 _source;
        // Current position inside the 2D memory; set to the start position by Reset().
        private SizeT _column;
        private SizeT _row;

        /// <summary>
        /// Creates an enumerator positioned before the first element of <paramref name="memory"/>.
        /// </summary>
        /// <param name="memory">Memory object to enumerate</param>
        public CudaPageLockedHostMemory2DEnumerator_dim3(CudaPageLockedHostMemory2D_dim3 memory)
        {
            _source = memory;
            Reset();
        }

        /// <summary>
        /// Nothing to release: the enumerator only holds a reference to the memory object.
        /// </summary>
        void IDisposable.Dispose()
        {
        }

        /// <summary>
        /// Puts the enumerator back to its start position (X = -1, Y = 0).
        /// </summary>
        public void Reset()
        {
            // NOTE(review): the -1 start value relies on how SizeT converts and increments
            // a negative int; SizeT is declared elsewhere - confirm.
            _column = -1;
            _row = 0;
        }

        /// <summary>
        /// Element at the current (X, Y) position, read through the indexer of the memory object.
        /// </summary>
        public dim3 Current
        {
            get
            {
                return _source[_column, _row];
            }
        }

        /// <summary>
        /// Non-generic view of <see cref="Current"/>.
        /// </summary>
        object IEnumerator.Current
        {
            get
            {
                return Current;
            }
        }

        /// <summary>
        /// Advances to the next element in row-major order.
        /// </summary>
        /// <returns>true while the Y position is still below Height, false otherwise</returns>
        public bool MoveNext()
        {
            _column += 1;

            // End of the current row reached: continue at the start of the next one.
            bool rowFinished = (long)_column >= (long)_source.Width;
            if (rowFinished)
            {
                _column = 0;
                _row += 1;
            }

            return (long)_row < (long)_source.Height;
        }
    }

	
}
